diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 14a90d7ccc2d2e..7c49f13610b983 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -59,7 +59,7 @@ endif() # LLVM_EXTERNAL_${project}_SOURCE_DIR using LLVM_ALL_PROJECTS # This allows an easy way of setting up a build directory for llvm and another # one for llvm+clang+... using the same sources. -set(LLVM_ALL_PROJECTS "clang;clang-tools-extra;compiler-rt;debuginfo-tests;libclc;libcxx;libcxxabi;libunwind;lld;lldb;llgo;openmp;parallel-libs;polly;pstl") +set(LLVM_ALL_PROJECTS "clang;clang-tools-extra;compiler-rt;debuginfo-tests;libc;libclc;libcxx;libcxxabi;libunwind;lld;lldb;llgo;openmp;parallel-libs;polly;pstl") set(LLVM_ENABLE_PROJECTS "" CACHE STRING "Semicolon-separated list of projects to build (${LLVM_ALL_PROJECTS}), or \"all\".") if( LLVM_ENABLE_PROJECTS STREQUAL "all" ) @@ -1097,3 +1097,7 @@ if (LLVM_INCLUDE_BENCHMARKS) add_subdirectory(utils/benchmark) add_subdirectory(benchmarks) endif() + +if (LLVM_INCLUDE_UTILS AND LLVM_INCLUDE_TOOLS) + add_subdirectory(utils/llvm-locstats) +endif() diff --git a/llvm/README.txt b/llvm/README.txt index 4a9bbedf365318..45356d05747ea4 100644 --- a/llvm/README.txt +++ b/llvm/README.txt @@ -16,5 +16,6 @@ documentation setup. If you are writing a package for LLVM, see docs/Packaging.rst for our suggestions. + Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. Notified per clause 4(b) of the license. diff --git a/llvm/cmake/modules/AddSphinxTarget.cmake b/llvm/cmake/modules/AddSphinxTarget.cmake index 22e3dcb776aa22..2bf654b60c444d 100644 --- a/llvm/cmake/modules/AddSphinxTarget.cmake +++ b/llvm/cmake/modules/AddSphinxTarget.cmake @@ -71,6 +71,11 @@ function (add_sphinx_target builder project) COMPONENT "${project}-sphinx-man" DESTINATION ${INSTALL_MANDIR}man1) + if(NOT LLVM_ENABLE_IDE) + add_llvm_install_targets("install-${SPHINX_TARGET_NAME}" + DEPENDS ${SPHINX_TARGET_NAME} + COMPONENT "${project}-sphinx-man") + endif() elseif (builder STREQUAL html) string(TOUPPER "${project}" project_upper) set(${project_upper}_INSTALL_SPHINX_HTML_DIR "share/doc/${project}/html" @@ -82,6 +87,12 @@ function (add_sphinx_target builder project) install(DIRECTORY "${SPHINX_BUILD_DIR}/." COMPONENT "${project}-sphinx-html" DESTINATION "${${project_upper}_INSTALL_SPHINX_HTML_DIR}") + + if(NOT LLVM_ENABLE_IDE) + add_llvm_install_targets("install-${SPHINX_TARGET_NAME}" + DEPENDS ${SPHINX_TARGET_NAME} + COMPONENT "${project}-sphinx-html") + endif() else() message(WARNING Installation of ${builder} not supported) endif() diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake index 71dfebb12b44f3..a1c28870c144c9 100644 --- a/llvm/cmake/modules/TableGen.cmake +++ b/llvm/cmake/modules/TableGen.cmake @@ -23,7 +23,7 @@ function(tablegen project ofn) file(RELATIVE_PATH ofn_rel ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${ofn}) set(additional_cmdline - -o ${ofn_rel}.tmp + -o ${ofn_rel} -d ${ofn_rel}.d WORKING_DIRECTORY ${CMAKE_BINARY_DIR} DEPFILE ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.d @@ -34,7 +34,7 @@ function(tablegen project ofn) file(GLOB local_tds "*.td") file(GLOB_RECURSE global_tds "${LLVM_MAIN_INCLUDE_DIR}/llvm/*.td") set(additional_cmdline - -o ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp + -o ${CMAKE_CURRENT_BINARY_DIR}/${ofn} ) endif() @@ -58,6 +58,15 @@ function(tablegen project ofn) endif() endif() + if (CMAKE_GENERATOR MATCHES "Visual Studio") + # Visual Studio has problems with llvm-tblgen's native --write-if-changed + # behavior. 
Since it doesn't do restat optimizations anyway, just don't + # pass --write-if-changed there. + set(tblgen_change_flag) + else() + set(tblgen_change_flag "--write-if-changed") + endif() + # We need both _TABLEGEN_TARGET and _TABLEGEN_EXE in the DEPENDS list # (both the target and the file) to have .inc files rebuilt on # a tablegen change, as cmake does not propagate file-level dependencies @@ -67,11 +76,11 @@ function(tablegen project ofn) # dependency twice in the result file when # ("${${project}_TABLEGEN_TARGET}" STREQUAL "${${project}_TABLEGEN_EXE}") # but lets us having smaller and cleaner code here. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp - # Generate tablegen output in a temporary file. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} COMMAND ${${project}_TABLEGEN_EXE} ${ARGN} -I ${CMAKE_CURRENT_SOURCE_DIR} ${LLVM_TABLEGEN_FLAGS} ${LLVM_TARGET_DEFINITIONS_ABSOLUTE} + ${tblgen_change_flag} ${additional_cmdline} # The file in LLVM_TARGET_DEFINITIONS may be not in the current # directory and local_tds may not contain it, so we must @@ -81,20 +90,9 @@ function(tablegen project ofn) ${LLVM_TARGET_DEFINITIONS_ABSOLUTE} COMMENT "Building ${ofn}..." ) - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn} - # Only update the real output file if there are any differences. - # This prevents recompilation of all the files depending on it if there - # aren't any. - COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp - ${CMAKE_CURRENT_BINARY_DIR}/${ofn} - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp - COMMENT "Updating ${ofn}..." - ) # `make clean' must remove all those generated files: - set_property(DIRECTORY APPEND - PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${ofn}.tmp ${ofn}) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${ofn}) set(TABLEGEN_OUTPUT ${TABLEGEN_OUTPUT} ${CMAKE_CURRENT_BINARY_DIR}/${ofn} PARENT_SCOPE) set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${ofn} PROPERTIES @@ -171,7 +169,13 @@ macro(add_tablegen target project) install(TARGETS ${target} ${export_to_llvmexports} + COMPONENT ${target} RUNTIME DESTINATION ${LLVM_TOOLS_INSTALL_DIR}) + if(NOT LLVM_ENABLE_IDE) + add_llvm_install_targets("install-${target}" + DEPENDS ${target} + COMPONENT ${target}) + endif() endif() set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${target}) endmacro() diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst index 52ad316459d4ca..77fece030eb603 100644 --- a/llvm/docs/CommandGuide/index.rst +++ b/llvm/docs/CommandGuide/index.rst @@ -74,3 +74,4 @@ Developer Tools llvm-build llvm-exegesis llvm-pdbutil + llvm-locstats diff --git a/llvm/docs/CommandGuide/llvm-locstats.rst b/llvm/docs/CommandGuide/llvm-locstats.rst new file mode 100644 index 00000000000000..8b99917a1ff4ae --- /dev/null +++ b/llvm/docs/CommandGuide/llvm-locstats.rst @@ -0,0 +1,79 @@ +llvm-locstats - calculate statistics on DWARF debug location +============================================================ + +.. program:: llvm-locstats + +SYNOPSIS +-------- + +:program:`llvm-locstats` [*options*] [*filename*] + +DESCRIPTION +----------- + +:program:`llvm-locstats` works like a wrapper around :program:`llvm-dwarfdump`. +It parses :program:`llvm-dwarfdump` statistics regarding debug location by +pretty printing it in a more human readable way. 
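+
+For example, a typical run passes a single object file or executable that was
+built with debug info; the file name below is only a placeholder, and the
+options listed further down can be used to narrow the statistics:
+
+.. code-block:: none
+
+  llvm-locstats a.out
+  llvm-locstats -only-variables a.out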
+
+The line 0% shows the number and the percentage of DIEs with no location
+information, while the line 100% shows the information for DIEs where there is
+location information in all of the code section bytes in which the variable or
+parameter is in scope. The line 50-59% shows the number and the percentage of
+DIEs whose location information covers between 50 and 59 percent of their
+scope.
+
+OPTIONS
+-------
+
+.. option:: -only-variables
+
+  Calculate the location statistics only for local variables.
+
+.. option:: -only-formal-parameters
+
+  Calculate the location statistics only for formal parameters.
+
+.. option:: -ignore-debug-entry-values
+
+  Ignore the location statistics on locations containing the
+  debug entry values DWARF operation.
+
+EXIT STATUS
+-----------
+
+:program:`llvm-locstats` returns 0 if the input file was parsed
+successfully. Otherwise, it returns 1.
+
+OUTPUT EXAMPLE
+--------------
+
+.. code-block:: none
+
+  =================================================
+            Debug Location Statistics
+  =================================================
+      cov%          samples       percentage(~)
+  -------------------------------------------------
+     0%                  1               16%
+     1-9%                0                0%
+     10-19%              0                0%
+     20-29%              0                0%
+     30-39%              0                0%
+     40-49%              0                0%
+     50-59%              1               16%
+     60-69%              0                0%
+     70-79%              0                0%
+     80-89%              1               16%
+     90-99%              0                0%
+     100%                3               50%
+  =================================================
+  -the number of debug variables processed: 6
+  -PC ranges covered: 81%
+  -------------------------------------------------
+  -total availability: 83%
+  =================================================
+
+SEE ALSO
+--------
+
+:manpage:`llvm-dwarfdump(1)`
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 35e4d421f0b15e..ccdcf13eb597fc 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -82,6 +82,11 @@ multiple file formats. Remove the specified section from the output. Can be specified multiple times to remove multiple sections simultaneously.
+.. option:: --set-section-alignment <section>=<align>
+
+ Set the alignment of section ``<section>`` to ``<align>``. Can be specified
+ multiple times to update multiple sections.
+
 .. option:: --strip-all-gnu Remove all symbols, debug sections and relocations from the output. This option
diff --git a/llvm/docs/FAQ.rst b/llvm/docs/FAQ.rst
index 2c69abfdd0bc57..1afba7557bd732 100644
--- a/llvm/docs/FAQ.rst
+++ b/llvm/docs/FAQ.rst
@@ -9,17 +9,10 @@ Frequently Asked Questions (FAQ) License =======
-Does the University of Illinois Open Source License really qualify as an "open source" license?
-----------------------------------------------------------------------------------------------
-Yes, the license is `certified
-`_ by the Open Source
-Initiative (OSI).
-
-
 Can I modify LLVM source code and redistribute the modified source? ------------------------------------------------------------------- Yes. The modified source distribution must retain the copyright notice and
-follow the three bulleted conditions listed in the `LLVM license
+follow the conditions listed in the `LLVM license
 `_.
 
@@ -41,10 +34,12 @@ the STL. How portable is the LLVM source code? ------------------------------------- The LLVM source code should be portable to most modern Unix-like operating
-systems. Most of the code is written in standard C++ with operating system
+systems. LLVM also has excellent support on Windows.
+Most of the code is written in standard C++ with operating system
 services abstracted to a support library. The tools required to build and test LLVM have been ported to a plethora of platforms.
 
+
 What API do I use to store a value to one of the virtual registers in LLVM IR's SSA representation? ---------------------------------------------------------------------------------------------------
diff --git a/llvm/docs/GettingStartedTutorials.rst b/llvm/docs/GettingStartedTutorials.rst
new file mode 100644
index 00000000000000..a8d60343ddc5b1
--- /dev/null
+++ b/llvm/docs/GettingStartedTutorials.rst
@@ -0,0 +1,34 @@
+Getting Started/Tutorials
+=========================
+
+For those new to the LLVM system.
+
+.. toctree::
+   :hidden:
+
+   Frontend/PerformanceTips
+   GettingStarted
+   GettingStartedVS
+   ProgrammersManual
+   tutorial/index
+
+:doc:`GettingStarted`
+   Discusses how to get up and running quickly with the LLVM infrastructure.
+   Everything from unpacking and compilation of the distribution to execution
+   of some tools.
+
+:doc:`tutorial/index`
+   Tutorials about using LLVM. Includes a tutorial about making a custom
+   language with LLVM.
+
+:doc:`ProgrammersManual`
+   Introduction to the general layout of the LLVM sourcebase, important classes
+   and APIs, and some tips & tricks.
+
+:doc:`Frontend/PerformanceTips`
+   A collection of tips for frontend authors on how to generate IR
+   which LLVM is able to effectively optimize.
+
+:doc:`GettingStartedVS`
+   An addendum to the main Getting Started guide for those using Visual Studio
+   on Windows.
\ No newline at end of file
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 2d4c57b7968118..d9a38907c920a1 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15940,6 +15940,102 @@ mode is determined by the runtime floating-point environment. The rounding mode argument is only intended as information to the compiler.
+'``llvm.experimental.constrained.lrint``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <inttype>
+      @llvm.experimental.constrained.lrint(<fptype> <op>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.lrint``' intrinsic returns the first
+operand rounded to the nearest integer. An inexact floating-point exception
+will be raised if the operand is not an integer. An invalid exception is
+raised if the result is too large to fit into a supported integer type,
+and in this case the result is undefined.
+
+Arguments:
+""""""""""
+
+The first argument is a floating-point number. The return value is an
+integer type. Not all types are supported on all targets. The supported
+types are the same as the ``llvm.lrint`` intrinsic and the ``lrint``
+libm functions.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``lrint`` functions
+would, and handles error conditions in the same way.
+
+The rounding mode is described, not determined, by the rounding mode
+argument. The actual rounding mode is determined by the runtime floating-point
+environment. The rounding mode argument is only intended as information
+to the compiler.
+
+If the runtime floating-point environment is using the default rounding mode
+then the results will be the same as the llvm.lrint intrinsic.
+
+
+'``llvm.experimental.constrained.llrint``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <inttype>
+      @llvm.experimental.constrained.llrint(<fptype> <op>,
+                                            metadata <rounding mode>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.llrint``' intrinsic returns the first
+operand rounded to the nearest integer. An inexact floating-point exception
+will be raised if the operand is not an integer. An invalid exception is
+raised if the result is too large to fit into a supported integer type,
+and in this case the result is undefined.
+
+Arguments:
+""""""""""
+
+The first argument is a floating-point number. The return value is an
+integer type. Not all types are supported on all targets. The supported
+types are the same as the ``llvm.llrint`` intrinsic and the ``llrint``
+libm functions.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``llrint`` functions
+would, and handles error conditions in the same way.
+
+The rounding mode is described, not determined, by the rounding mode
+argument. The actual rounding mode is determined by the runtime floating-point
+environment. The rounding mode argument is only intended as information
+to the compiler.
+
+If the runtime floating-point environment is using the default rounding mode
+then the results will be the same as the llvm.llrint intrinsic.
+
+
 '``llvm.experimental.constrained.nearbyint``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -16162,6 +16258,82 @@ This function returns the same values as the libm ``round`` functions would and handles error conditions in the same way.
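+
+As an illustration of how these constrained rounding intrinsics (``lrint`` and
+``llrint`` above, ``lround`` and ``llround`` below) are intended to be called,
+here is a small IR sketch. It assumes the common ``i64``/``f64`` overload; the
+exact mangled intrinsic name is an assumption for illustration, not something
+this change documents:
+
+.. code-block:: llvm
+
+  define i64 @round_to_long(double %x) {
+  entry:
+    %r = call i64 @llvm.experimental.constrained.lrint.i64.f64(double %x, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    ret i64 %r
+  }
+
+  declare i64 @llvm.experimental.constrained.lrint.i64.f64(double, metadata, metadata)
+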
+'``llvm.experimental.constrained.lround``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.lround( , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.lround``' intrinsic returns the first +operand rounded to the nearest integer with ties away from zero. It will +raise an inexact floating-point exception if the operand is not an integer. +An invalid exception is raised if the result is too large to fit into a +supported integer type, and in this case the result is undefined. + +Arguments: +"""""""""" + +The first argument is a floating-point number. The return value is an +integer type. Not all types are supported on all targets. The supported +types are the same as the ``llvm.lround`` intrinsic and the ``lround`` +libm functions. + +The second argument specifies the exception behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``lround`` functions +would and handles error conditions in the same way. + + +'``llvm.experimental.constrained.llround``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.llround( , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.llround``' intrinsic returns the first +operand rounded to the nearest integer with ties away from zero. It will +raise an inexact floating-point exception if the operand is not an integer. +An invalid exception is raised if the result is too large to fit into a +supported integer type, and in this case the result is undefined. + +Arguments: +"""""""""" + +The first argument is a floating-point number. The return value is an +integer type. Not all types are supported on all targets. The supported +types are the same as the ``llvm.llround`` intrinsic and the ``llround`` +libm functions. + +The second argument specifies the exception behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``llround`` functions +would and handles error conditions in the same way. + + '``llvm.experimental.constrained.trunc``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/ProgrammingDocumentation.rst b/llvm/docs/ProgrammingDocumentation.rst deleted file mode 100644 index 6a4d7aa25ba786..00000000000000 --- a/llvm/docs/ProgrammingDocumentation.rst +++ /dev/null @@ -1,67 +0,0 @@ -Programming Documentation -========================= - -For developers of applications which use LLVM as a library. - -.. toctree:: - :hidden: - - Atomics - CommandLine - CommandGuide/index - ExtendingLLVM - HowToSetUpLLVMStyleRTTI - ProgrammersManual - Extensions - LibFuzzer - FuzzingLLVM - ScudoHardenedAllocator - OptBisect - GwpAsan - -:doc:`Atomics` - Information about LLVM's concurrency model. - -:doc:`ProgrammersManual` - Introduction to the general layout of the LLVM sourcebase, important classes - and APIs, and some tips & tricks. - -:doc:`Extensions` - LLVM-specific extensions to tools and formats LLVM seeks compatibility with. - -:doc:`HowToSetUpLLVMStyleRTTI` - How to make ``isa<>``, ``dyn_cast<>``, etc. available for clients of your - class hierarchy. - -:doc:`ExtendingLLVM` - Look here to see how to add instructions and intrinsics to LLVM. - -:doc:`ScudoHardenedAllocator` - A library that implements a security-hardened `malloc()`. 
- -:doc:`GwpAsan` - A sampled heap memory error detection toolkit designed for production use. - -============ -Command Line -============ - -:doc:`CommandLine` - Provides information on using the command line parsing library. - -:doc:`OptBisect` - A command line option for debugging optimization-induced failures. - -:doc:`LLVM Command Guide ` - A reference manual for the LLVM command line utilities ("man" pages for LLVM - tools). - -========= -LibFuzzer -========= - -:doc:`LibFuzzer` - A library for writing in-process guided fuzzers. - -:doc:`FuzzingLLVM` - Information on writing and using Fuzzers to find bugs in LLVM. \ No newline at end of file diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst index 49ff6b6e0e2dba..a24a8a1eade5a6 100644 --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -3,45 +3,217 @@ Reference LLVM and API reference documentation. +.. contents:: + :local: + .. toctree:: :hidden: - LangRef - TestingGuide + Atomics + BitCodeFormat + BlockFrequencyTerminology + BranchWeightMetadata + Bugpoint + CFIVerify + CommandGuide/index CompilerWriterInfo + Coroutines + DependenceGraphs/index + ExceptionHandling + Extensions + FaultMaps + FuzzingLLVM + GarbageCollection + GetElementPtr + GlobalISel + GwpAsan + HowToSetUpLLVMStyleRTTI + HowToUseAttributes + InAlloca + LangRef + LibFuzzer + MarkedUpDisassembly + MemorySSA MIRLangRef - NVPTXUsage - AMDGPUUsage + OptBisect + ORCv2 + PDB/index + ScudoHardenedAllocator + SegmentedStacks + StackMaps + SpeculativeLoadHardening + SupportLibrary + Statepoints + SystemLibrary + TestingGuide + TransformMetadata + TypeMetadata + XRay + XRayExample + XRayFDRFormat + YamlIO + +API Reference +------------- + +`Doxygen generated documentation `_ + (`classes `_) + +`Documentation for Go bindings `_ + +:doc:`ORCv2` + Describes the design and implementation of the ORC APIs, including some + usage examples, and a guide for users transitioning from ORCv1 to ORCv2. -============== LLVM Reference -============== +-------------- :doc:`LLVM Language Reference Manual ` Defines the LLVM intermediate representation and the assembly form of the different nodes. -:doc:`LLVM Testing Infrastructure Guide ` - A reference manual for using the LLVM testing infrastructure. +:doc:`FaultMaps` + LLVM support for folding control flow into faulting machine instructions. -:doc:`CompilerWriterInfo` - A list of helpful links for compiler writers. +:doc:`InAlloca` + Description of the ``inalloca`` argument attribute. :doc:`Machine IR (MIR) Format Reference Manual ` A reference manual for the MIR serialization format, which is used to test LLVM's code generation passes. -:doc:`NVPTXUsage` - This document describes using the NVPTX backend to compile GPU kernels. +:doc:`GlobalISel` + This describes the prototype instruction selection replacement, GlobalISel. -:doc:`AMDGPUUsage` - This document describes using the AMDGPU backend to compile GPU kernels. +:doc:`Atomics` + Information about LLVM's concurrency model. -============= -API Reference -============= +:doc:`ExceptionHandling` + This document describes the design and implementation of exception handling + in LLVM. -`Doxygen generated documentation `_ - (`classes `_) +:doc:`CompilerWriterInfo` + A list of helpful links for compiler writers. -`Documentation for Go bindings `_ +:doc:`BitCodeFormat` + This describes the file format and encoding used for LLVM "bc" files. + +:doc:`Extensions` + LLVM-specific extensions to tools and formats LLVM seeks compatibility with. 
+ +:doc:`HowToSetUpLLVMStyleRTTI` + How to make ``isa<>``, ``dyn_cast<>``, etc. available for clients of your + class hierarchy. + +:doc:`BlockFrequencyTerminology` + Provides information about terminology used in the ``BlockFrequencyInfo`` + analysis pass. + +:doc:`BranchWeightMetadata` + Provides information about Branch Prediction Information. + +:doc:`MemorySSA` + Information about the MemorySSA utility in LLVM, as well as how to use it. + +:doc:`Support Library ` + This document describes the LLVM Support Library (``lib/Support``) and + how to keep LLVM source code portable + +:doc:`GetElementPtr` + Answers to some very frequent questions about LLVM's most frequently + misunderstood instruction. + +:doc:`ScudoHardenedAllocator` + A library that implements a security-hardened `malloc()`. + +:doc:`GwpAsan` + A sampled heap memory error detection toolkit designed for production use. + +:doc:`Dependence Graphs ` + A description of the design of the various dependence graphs such as + the DDG (Data Dependence Graph). + +:doc:`CFIVerify` + A description of the verification tool for Control Flow Integrity. + +:doc:`SpeculativeLoadHardening` + A description of the Speculative Load Hardening mitigation for Spectre v1. + +:doc:`SegmentedStacks` + This document describes segmented stacks and how they are used in LLVM. + +:doc:`MarkedUpDisassembly` + This document describes the optional rich disassembly output syntax. + +:doc:`HowToUseAttributes` + Answers some questions about the new Attributes infrastructure. + +:doc:`StackMaps` + LLVM support for mapping instruction addresses to the location of + values and allowing code to be patched. + +:doc:`Coroutines` + LLVM support for coroutines. + +:doc:`YamlIO` + A reference guide for using LLVM's YAML I/O library. + +====================== +Command Line Utilities +====================== + +:doc:`LLVM Command Guide ` + A reference manual for the LLVM command line utilities ("man" pages for LLVM + tools). + +:doc:`Bugpoint` + Automatic bug finder and test-case reducer description and usage + information. + +:doc:`OptBisect` + A command line option for debugging optimization-induced failures. + +:doc:`The Microsoft PDB File Format ` + A detailed description of the Microsoft PDB (Program Database) file format. + +================== +Garbage Collection +================== + +:doc:`GarbageCollection` + The interfaces source-language compilers should use for compiling GC'd + programs. + +:doc:`Statepoints` + This describes a set of experimental extensions for garbage + collection support. + +========= +LibFuzzer +========= + +:doc:`LibFuzzer` + A library for writing in-process guided fuzzers. + +:doc:`FuzzingLLVM` + Information on writing and using Fuzzers to find bugs in LLVM. + +======= +Testing +======= + +:doc:`LLVM Testing Infrastructure Guide ` + A reference manual for using the LLVM testing infrastructure. + +:doc:`TestSuiteGuide` + Describes how to compile and run the test-suite benchmarks. + +==== +XRay +==== + +:doc:`XRay` + High-level documentation of how to use XRay in LLVM. + +:doc:`XRayExample` + An example of how to debug an application with XRay. \ No newline at end of file diff --git a/llvm/docs/SubsystemDocumentation.rst b/llvm/docs/SubsystemDocumentation.rst deleted file mode 100644 index 69764cbd2822ab..00000000000000 --- a/llvm/docs/SubsystemDocumentation.rst +++ /dev/null @@ -1,202 +0,0 @@ -.. _index-subsystem-docs: - -Subsystem Documentation -======================= - -For API clients and LLVM developers. - -.. 
toctree:: - :hidden: - - AliasAnalysis - MemorySSA - BitCodeFormat - BlockFrequencyTerminology - BranchWeightMetadata - Bugpoint - CodeGenerator - ExceptionHandling - AddingConstrainedIntrinsics - LinkTimeOptimization - SegmentedStacks - TableGenFundamentals - TableGen/index - DebuggingJITedCode - GoldPlugin - MarkedUpDisassembly - SystemLibrary - SupportLibrary - SourceLevelDebugging - Vectorizers - WritingAnLLVMBackend - GarbageCollection - WritingAnLLVMPass - HowToUseAttributes - StackMaps - InAlloca - BigEndianNEON - CoverageMappingFormat - Statepoints - MergeFunctions - TypeMetadata - TransformMetadata - FaultMaps - Coroutines - GlobalISel - XRay - XRayExample - XRayFDRFormat - PDB/index - CFIVerify - SpeculativeLoadHardening - StackSafetyAnalysis - LoopTerminology - DependenceGraphs/index - -:doc:`WritingAnLLVMPass` - Information on how to write LLVM transformations and analyses. - -:doc:`WritingAnLLVMBackend` - Information on how to write LLVM backends for machine targets. - -:doc:`CodeGenerator` - The design and implementation of the LLVM code generator. Useful if you are - working on retargetting LLVM to a new architecture, designing a new codegen - pass, or enhancing existing components. - -:doc:`TableGen ` - Describes the TableGen tool, which is used heavily by the LLVM code - generator. - -:doc:`AliasAnalysis` - Information on how to write a new alias analysis implementation or how to - use existing analyses. - -:doc:`MemorySSA` - Information about the MemorySSA utility in LLVM, as well as how to use it. - -:doc:`Source Level Debugging with LLVM ` - This document describes the design and philosophy behind the LLVM - source-level debugger. - -:doc:`Vectorizers` - This document describes the current status of vectorization in LLVM. - -:doc:`ExceptionHandling` - This document describes the design and implementation of exception handling - in LLVM. - -:doc:`AddingConstrainedIntrinsics` - Gives the steps necessary when adding a new constrained math intrinsic - to LLVM. - -:doc:`Bugpoint` - Automatic bug finder and test-case reducer description and usage - information. - -:doc:`BitCodeFormat` - This describes the file format and encoding used for LLVM "bc" files. - -:doc:`Support Library ` - This document describes the LLVM Support Library (``lib/Support``) and - how to keep LLVM source code portable - -:doc:`LinkTimeOptimization` - This document describes the interface between LLVM intermodular optimizer - and the linker and its design - -:doc:`GoldPlugin` - How to build your programs with link-time optimization on Linux. - -:doc:`DebuggingJITedCode` - How to debug JITed code with GDB. - -:doc:`MCJITDesignAndImplementation` - Describes the inner workings of MCJIT execution engine. - -:doc:`ORCv2` - Describes the design and implementation of the ORC APIs, including some - usage examples, and a guide for users transitioning from ORCv1 to ORCv2. - -:doc:`BranchWeightMetadata` - Provides information about Branch Prediction Information. - -:doc:`BlockFrequencyTerminology` - Provides information about terminology used in the ``BlockFrequencyInfo`` - analysis pass. - -:doc:`SegmentedStacks` - This document describes segmented stacks and how they are used in LLVM. - -:doc:`MarkedUpDisassembly` - This document describes the optional rich disassembly output syntax. - -:doc:`HowToUseAttributes` - Answers some questions about the new Attributes infrastructure. - -:doc:`StackMaps` - LLVM support for mapping instruction addresses to the location of - values and allowing code to be patched. 
- -:doc:`BigEndianNEON` - LLVM's support for generating NEON instructions on big endian ARM targets is - somewhat nonintuitive. This document explains the implementation and rationale. - -:doc:`CoverageMappingFormat` - This describes the format and encoding used for LLVM’s code coverage mapping. - -:doc:`MergeFunctions` - Describes functions merging optimization. - -:doc:`InAlloca` - Description of the ``inalloca`` argument attribute. - -:doc:`FaultMaps` - LLVM support for folding control flow into faulting machine instructions. - -:doc:`CompileCudaWithLLVM` - LLVM support for CUDA. - -:doc:`Coroutines` - LLVM support for coroutines. - -:doc:`GlobalISel` - This describes the prototype instruction selection replacement, GlobalISel. - -:doc:`XRay` - High-level documentation of how to use XRay in LLVM. - -:doc:`XRayExample` - An example of how to debug an application with XRay. - -:doc:`The Microsoft PDB File Format ` - A detailed description of the Microsoft PDB (Program Database) file format. - -:doc:`CFIVerify` - A description of the verification tool for Control Flow Integrity. - -:doc:`SpeculativeLoadHardening` - A description of the Speculative Load Hardening mitigation for Spectre v1. - -:doc:`StackSafetyAnalysis` - This document describes the design of the stack safety analysis of local - variables. - -:doc:`LoopTerminology` - A document describing Loops and associated terms as used in LLVM. - -:doc:`Dependence Graphs ` - A description of the design of the various dependence graphs such as - the DDG (Data Dependence Graph). - -================== -Garbage Collection -================== - -:doc:`GarbageCollection` - The interfaces source-language compilers should use for compiling GC'd - programs. - -:doc:`Statepoints` - This describes a set of experimental extensions for garbage - collection support. diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index d75769a793899c..5c035d1717d6a9 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -1,60 +1,60 @@ User Guides =========== -For those new to the LLVM system. - NOTE: If you are a user who is only interested in using an LLVM-based compiler, you should look into `Clang `_ instead. The documentation here is intended for users who have a need to work with the intermediate LLVM representation. +.. contents:: + :local: + .. toctree:: :hidden: + AddingConstrainedIntrinsics + AdvancedBuilds + AliasAnalysis + AMDGPUUsage + Benchmarking + BigEndianNEON + BuildingADistribution CMake CMakePrimer - AdvancedBuilds + CodeGenerator + CodeOfConduct + CommandLine + CompileCudaWithLLVM + CoverageMappingFormat + DebuggingJITedCode + Docker + ExtendingLLVM + GoldPlugin HowToBuildOnARM HowToBuildWithPGO HowToCrossCompileBuiltinsOnArm HowToCrossCompileLLVM - yaml2obj + LinkTimeOptimization + LoopTerminology MarkdownQuickstartTemplate + MergeFunctions + MCJITDesignAndImplementation + NVPTXUsage Phabricator Passes - YamlIO - GetElementPtr - Frontend/PerformanceTips - MCJITDesignAndImplementation - ORCv2 - CodeOfConduct - CompileCudaWithLLVM ReportingGuide - Benchmarking - Docker - BuildingADistribution Remarks + StackSafetyAnalysis + SourceLevelDebugging + TableGen/index + TableGenFundamentals + Vectorizers + WritingAnLLVMPass + WritingAnLLVMBackend + yaml2obj -Building, Packaging, and Distributing LLVM ------------------------------------------- - -How to build, package, and distribute LLVM. 
- -===== -CMake -===== - -:doc:`BuildingADistribution` - A best-practices guide for using LLVM's CMake build system to package and - distribute LLVM-based tools. - -:doc:`CMake` - An addendum to the main Getting Started guide for those using the `CMake - build system `_. - -===== Clang -===== +----- :doc:`HowToBuildOnARM` Notes on building and testing LLVM/Clang on ARM. @@ -70,36 +70,113 @@ Clang .. __: http://clang.llvm.org/get_started.html -====== -Docker -====== +:doc:`CoverageMappingFormat` + This describes the format and encoding used for LLVM’s code coverage mapping. + +LLVM Builds and Distributions +----------------------------- + +:doc:`BuildingADistribution` + A best-practices guide for using LLVM's CMake build system to package and + distribute LLVM-based tools. + +:doc:`CMake` + An addendum to the main Getting Started guide for those using the `CMake + build system `_. :doc:`Docker` A reference for using Dockerfiles provided with LLVM. -================= -Additional Topics -================= +Optimizations +------------- -:doc:`HowToCrossCompileBuiltinsOnArm` - Notes on cross-building and testing the compiler-rt builtins for Arm. +:doc:`WritingAnLLVMPass` + Information on how to write LLVM transformations and analyses. :doc:`Passes` A list of optimizations and analyses implemented in LLVM. -:doc:`TestSuiteGuide` - Describes how to compile and run the test-suite benchmarks. +:doc:`StackSafetyAnalysis` + This document describes the design of the stack safety analysis of local + variables. + +:doc:`MergeFunctions` + Describes functions merging optimization. + +:doc:`AliasAnalysis` + Information on how to write a new alias analysis implementation or how to + use existing analyses. + +:doc:`LoopTerminology` + A document describing Loops and associated terms as used in LLVM. -:doc:`YamlIO` - A reference guide for using LLVM's YAML I/O library. +:doc:`Vectorizers` + This document describes the current status of vectorization in LLVM. -:doc:`GetElementPtr` - Answers to some very frequent questions about LLVM's most frequently - misunderstood instruction. +:doc:`LinkTimeOptimization` + This document describes the interface between LLVM intermodular optimizer + and the linker and its design -:doc:`Frontend/PerformanceTips` - A collection of tips for frontend authors on how to generate IR - which LLVM is able to effectively optimize. +:doc:`GoldPlugin` + How to build your programs with link-time optimization on Linux. :doc:`Remarks` - A reference on the implementation of remarks in LLVM. \ No newline at end of file + A reference on the implementation of remarks in LLVM. + +:doc:`Source Level Debugging with LLVM ` + This document describes the design and philosophy behind the LLVM + source-level debugger. + +Code Generation +--------------- + +:doc:`WritingAnLLVMBackend` + Information on how to write LLVM backends for machine targets. + +:doc:`CodeGenerator` + The design and implementation of the LLVM code generator. Useful if you are + working on retargetting LLVM to a new architecture, designing a new codegen + pass, or enhancing existing components. + +:doc:`TableGen ` + Describes the TableGen tool, which is used heavily by the LLVM code + generator. + +=== +JIT +=== + +:doc:`MCJITDesignAndImplementation` + Describes the inner workings of MCJIT execution engine. + +:doc:`DebuggingJITedCode` + How to debug JITed code with GDB. + +Additional Topics +----------------- + +:doc:`CommandLine` + Provides information on using the command line parsing library. 
+ +:doc:`ExtendingLLVM` + Look here to see how to add instructions and intrinsics to LLVM. + +:doc:`AddingConstrainedIntrinsics` + Gives the steps necessary when adding a new constrained math intrinsic + to LLVM. + +:doc:`HowToCrossCompileBuiltinsOnArm` + Notes on cross-building and testing the compiler-rt builtins for Arm. + +:doc:`BigEndianNEON` + LLVM's support for generating NEON instructions on big endian ARM targets is + somewhat nonintuitive. This document explains the implementation and rationale. + +:doc:`CompileCudaWithLLVM` + LLVM support for CUDA. + +:doc:`NVPTXUsage` + This document describes using the NVPTX backend to compile GPU kernels. + +:doc:`AMDGPUUsage` + This document describes using the AMDGPU backend to compile GPU kernels. \ No newline at end of file diff --git a/llvm/docs/index.rst b/llvm/docs/index.rst index 17a0706a196bd8..531616d69129cc 100644 --- a/llvm/docs/index.rst +++ b/llvm/docs/index.rst @@ -53,46 +53,19 @@ Getting Started, How-tos, Developer Guides, and Tutorials. .. toctree:: :hidden: - UserGuides - ProgrammingDocumentation + GettingStartedTutorials Reference - SubsystemDocumentation + UserGuides -:doc:`UserGuides` +:doc:`GettingStartedTutorials` For those new to the LLVM system. -:doc:`ProgrammingDocumentation` - For developers of applications which use LLVM as a library. - -:doc:`SubsystemDocumentation` - For API clients and LLVM developers. +:doc:`UserGuides` + User guides and How-tos. :doc:`Reference` LLVM and API reference documentation. -Getting Started/Tutorials -------------------------- - -.. toctree:: - :hidden: - - GettingStarted - tutorial/index - GettingStartedVS - -:doc:`GettingStarted` - Discusses how to get up and running quickly with the LLVM infrastructure. - Everything from unpacking and compilation of the distribution to execution - of some tools. - -:doc:`tutorial/index` - Tutorials about using LLVM. Includes a tutorial about making a custom - language with LLVM. - -:doc:`GettingStartedVS` - An addendum to the main Getting Started guide for those using Visual Studio - on Windows. - Community ========= diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h index c96faa2fbba4ef..e9e0947620ad4c 100644 --- a/llvm/include/llvm-c/DebugInfo.h +++ b/llvm/include/llvm-c/DebugInfo.h @@ -169,6 +169,19 @@ typedef unsigned LLVMMetadataKind; */ typedef unsigned LLVMDWARFTypeEncoding; +/** + * Describes the kind of macro declaration used for LLVMDIBuilderCreateMacro. + * @see llvm::dwarf::MacinfoRecordType + * @note Values are from DW_MACINFO_* constants in the DWARF specification. + */ +typedef enum { + LLVMDWARFMacinfoRecordTypeDefine = 0x01, + LLVMDWARFMacinfoRecordTypeMacro = 0x02, + LLVMDWARFMacinfoRecordTypeStartFile = 0x03, + LLVMDWARFMacinfoRecordTypeEndFile = 0x04, + LLVMDWARFMacinfoRecordTypeVendorExt = 0xff +} LLVMDWARFMacinfoRecordType; + /** * The current debug metadata version number. */ @@ -521,6 +534,38 @@ LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Builder, unsigned NumParameterTypes, LLVMDIFlags Flags); +/** + * Create debugging information entry for a macro. + * @param Builder The DIBuilder. + * @param ParentMacroFile Macro parent (could be NULL). + * @param Line Source line number where the macro is defined. + * @param MacroType DW_MACINFO_define or DW_MACINFO_undef. + * @param Name Macro name. + * @param NameLen Macro name length. + * @param Value Macro value. + * @param ValueLen Macro value length. 
+ */ +LLVMMetadataRef LLVMDIBuilderCreateMacro(LLVMDIBuilderRef Builder, + LLVMMetadataRef ParentMacroFile, + unsigned Line, + LLVMDWARFMacinfoRecordType RecordType, + const char *Name, size_t NameLen, + const char *Value, size_t ValueLen); + +/** + * Create debugging information temporary entry for a macro file. + * List of macro node direct children will be calculated by DIBuilder, + * using the \p ParentMacroFile relationship. + * @param Builder The DIBuilder. + * @param ParentMacroFile Macro parent (could be NULL). + * @param Line Source line number where the macro file is included. + * @param File File descriptor containing the name of the macro file. + */ +LLVMMetadataRef +LLVMDIBuilderCreateTempMacroFile(LLVMDIBuilderRef Builder, + LLVMMetadataRef ParentMacroFile, unsigned Line, + LLVMMetadataRef File); + /** * Create debugging information entry for an enumerator. * @param Builder The DIBuilder. diff --git a/llvm/include/llvm/ADT/iterator_range.h b/llvm/include/llvm/ADT/iterator_range.h index 774c7c4e3366e5..aa8830943cabc1 100644 --- a/llvm/include/llvm/ADT/iterator_range.h +++ b/llvm/include/llvm/ADT/iterator_range.h @@ -44,6 +44,7 @@ class iterator_range { IteratorT begin() const { return begin_iterator; } IteratorT end() const { return end_iterator; } + bool empty() const { return begin_iterator == end_iterator; } }; /// Convenience function for iterating over sub-ranges. diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index decd4dd3a96596..1a397068caf0ab 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -391,7 +391,7 @@ enum CastOpcodes { /// have no fixed relation to the LLVM IR enum values. Changing these will /// break compatibility with old files. enum UnaryOpcodes { - UNOP_NEG = 0 + UNOP_FNEG = 0 }; /// BinaryOpcodes - These are values used in the bitcode files to encode which diff --git a/llvm/include/llvm/CodeGen/AccelTable.h b/llvm/include/llvm/CodeGen/AccelTable.h index 734531a65d5053..f8f6b5448f3f14 100644 --- a/llvm/include/llvm/CodeGen/AccelTable.h +++ b/llvm/include/llvm/CodeGen/AccelTable.h @@ -101,8 +101,6 @@ /// /// An Apple Accelerator Table can be serialized by calling emitAppleAccelTable /// function. -/// -/// TODO: Add DWARF v5 emission code. 
namespace llvm { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index a5fab5aad1983f..bf60319996a79f 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -230,6 +230,8 @@ class LegalizerHelper { LegalizeResult lowerUnmergeValues(MachineInstr &MI); LegalizeResult lowerShuffleVector(MachineInstr &MI); LegalizeResult lowerDynStackAlloc(MachineInstr &MI); + LegalizeResult lowerExtract(MachineInstr &MI); + LegalizeResult lowerInsert(MachineInstr &MI); private: MachineRegisterInfo &MRI; diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 1059b6bd41b3a1..d052cfb43591c3 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -301,6 +301,7 @@ namespace ISD { STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2, STRICT_FRINT, STRICT_FNEARBYINT, STRICT_FMAXNUM, STRICT_FMINNUM, STRICT_FCEIL, STRICT_FFLOOR, STRICT_FROUND, STRICT_FTRUNC, + STRICT_LROUND, STRICT_LLROUND, STRICT_LRINT, STRICT_LLRINT, /// STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or /// unsigned integer. These have the same semantics as fptosi and fptoui diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 94e76a75e8da6f..069d0aa45095b7 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -314,6 +314,7 @@ struct ScalarEnumerationTraits { static void enumeration(yaml::IO &IO, TargetStackID::Value &ID) { IO.enumCase(ID, "default", TargetStackID::Default); IO.enumCase(ID, "sgpr-spill", TargetStackID::SGPRSpill); + IO.enumCase(ID, "sve-vec", TargetStackID::SVEVector); IO.enumCase(ID, "noalloc", TargetStackID::NoAlloc); } }; diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index d3ebe00c1c059a..00c8ca767ad7d3 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -618,6 +618,12 @@ class MachineInstr return hasPropertyInBundle(1ULL << MCFlag, Type); } + /// Return true if this is an instruction that should go through the usual + /// legalization steps. + bool isPreISelOpcode(QueryType Type = IgnoreBundle) const { + return hasProperty(MCID::PreISelOpcode, Type); + } + /// Return true if this instruction can have a variable number of operands. /// In this case, the variable operands will be after the normal /// operands but before the implicit definitions and uses (if any are diff --git a/llvm/include/llvm/CodeGen/MachineLoopUtils.h b/llvm/include/llvm/CodeGen/MachineLoopUtils.h new file mode 100644 index 00000000000000..41379b75d00a6a --- /dev/null +++ b/llvm/include/llvm/CodeGen/MachineLoopUtils.h @@ -0,0 +1,41 @@ +//=- MachineLoopUtils.h - Helper functions for manipulating loops -*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H +#define LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H + +namespace llvm { +class MachineBasicBlock; +class MachineRegisterInfo; +class TargetInstrInfo; + +enum LoopPeelDirection { + LPD_Front, ///< Peel the first iteration of the loop. 
+ LPD_Back ///< Peel the last iteration of the loop. +}; + +/// Peels a single block loop. Loop must have two successors, one of which +/// must be itself. Similarly it must have two predecessors, one of which must +/// be itself. +/// +/// The loop block is copied and inserted into the CFG such that two copies of +/// the loop follow on from each other. The copy is inserted either before or +/// after the loop based on Direction. +/// +/// Phis are updated and an unconditional branch inserted at the end of the +/// clone so as to execute a single iteration. +/// +/// The trip count of Loop is not updated. +MachineBasicBlock *PeelSingleBlockLoop(LoopPeelDirection Direction, + MachineBasicBlock *Loop, + MachineRegisterInfo &MRI, + const TargetInstrInfo *TII); + +} // namespace llvm + +#endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index 65f706302bc2ec..33a48a235e18c6 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -293,8 +293,6 @@ class MachineMemOperand { /// Support for operator<<. /// @{ - void print(raw_ostream &OS) const; - void print(raw_ostream &OS, ModuleSlotTracker &MST) const; void print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const; @@ -319,11 +317,6 @@ class MachineMemOperand { } }; -inline raw_ostream &operator<<(raw_ostream &OS, const MachineMemOperand &MRO) { - MRO.print(OS); - return OS; -} - } // End llvm namespace #endif diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h index 36cc843c8849eb..81a9b63b64ca31 100644 --- a/llvm/include/llvm/CodeGen/ModuloSchedule.h +++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h @@ -62,8 +62,10 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineLoopUtils.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include #include namespace llvm { @@ -142,9 +144,7 @@ class ModuloSchedule { /// Return the rescheduled instructions in order. ArrayRef getInstructions() { return ScheduledInstrs; } - void dump() { - print(dbgs()); - } + void dump() { print(dbgs()); } void print(raw_ostream &OS); }; @@ -270,9 +270,6 @@ class ModuloScheduleExpander { /// A reimplementation of ModuloScheduleExpander. It works by generating a /// standalone kernel loop and peeling out the prologs and epilogs. -/// -/// FIXME: This implementation cannot yet generate valid code. It can generate -/// a correct kernel but cannot peel out prologs and epilogs. class PeelingModuloScheduleExpander { ModuloSchedule &Schedule; MachineFunction &MF; @@ -281,17 +278,70 @@ class PeelingModuloScheduleExpander { const TargetInstrInfo *TII; LiveIntervals *LIS; + /// The original loop block that gets rewritten in-place. MachineBasicBlock *BB; + /// The original loop preheader. MachineBasicBlock *Preheader; + /// All prolog and epilog blocks. + SmallVector Prologs, Epilogs; + /// For every block, the stages that are produced. + DenseMap LiveStages; + /// For every block, the stages that are available. A stage can be available + /// but not produced (in the epilog) or produced but not available (in the + /// prolog). + DenseMap AvailableStages; + + /// CanonicalMIs and BlockMIs form a bidirectional map between any of the + /// loop kernel clones. 
+ DenseMap CanonicalMIs; + DenseMap, MachineInstr *> + BlockMIs; + + /// State passed from peelKernel to peelPrologAndEpilogs(). + std::deque PeeledFront, PeeledBack; + public: PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, LiveIntervals *LIS) : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), TII(ST.getInstrInfo()), LIS(LIS) {} + void expand(); + /// Runs ModuloScheduleExpander and treats it as a golden input to validate /// aspects of the code generated by PeelingModuloScheduleExpander. void validateAgainstModuloScheduleExpander(); + +protected: + /// Converts BB from the original loop body to the rewritten, pipelined + /// steady-state. + void rewriteKernel(); + +private: + /// Peels one iteration of the rewritten kernel (BB) in the specified + /// direction. + MachineBasicBlock *peelKernel(LoopPeelDirection LPD); + /// Peel the kernel forwards and backwards to produce prologs and epilogs, + /// and stitch them together. + void peelPrologAndEpilogs(); + /// All prolog and epilog blocks are clones of the kernel, so any produced + /// register in one block has an corollary in all other blocks. + Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB); + /// Change all users of MI, if MI is predicated out + /// (LiveStages[MI->getParent()] == false). + void rewriteUsesOf(MachineInstr *MI); + /// Insert branches between prologs, kernel and epilogs. + void fixupBranches(); + /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block + /// to a block dominated by all prologs and epilogs. This allows us to treat + /// the loop exiting block as any other kernel clone. + MachineBasicBlock *CreateLCSSAExitingBlock(); + /// Helper to get the stage of an instruction in the schedule. + unsigned getStage(MachineInstr *MI) { + if (CanonicalMIs.count(MI)) + MI = CanonicalMIs[MI]; + return Schedule.getStage(MI); + } }; /// Expander that simply annotates each scheduled instruction with a post-instr diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 2b00b8568705ed..ceb8b72635a29f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -701,12 +701,16 @@ END_TWO_BYTE_PACK() case ISD::STRICT_FLOG: case ISD::STRICT_FLOG10: case ISD::STRICT_FLOG2: + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: case ISD::STRICT_FMAXNUM: case ISD::STRICT_FMINNUM: case ISD::STRICT_FCEIL: case ISD::STRICT_FFLOOR: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: case ISD::STRICT_FROUND: case ISD::STRICT_FTRUNC: case ISD::STRICT_FP_TO_SINT: diff --git a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h index 31b5f794d90c69..b8adcf759b197c 100644 --- a/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h +++ b/llvm/include/llvm/CodeGen/SwitchLoweringUtils.h @@ -212,13 +212,14 @@ struct BitTestBlock { BitTestInfo Cases; BranchProbability Prob; BranchProbability DefaultProb; + bool OmitRangeCheck; BitTestBlock(APInt F, APInt R, const Value *SV, unsigned Rg, MVT RgVT, bool E, bool CR, MachineBasicBlock *P, MachineBasicBlock *D, BitTestInfo C, BranchProbability Pr) : First(std::move(F)), Range(std::move(R)), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E), ContiguousRange(CR), Parent(P), Default(D), - Cases(std::move(C)), Prob(Pr) {} + Cases(std::move(C)), Prob(Pr), OmitRangeCheck(false) {} }; /// Return the range of values within a range. 
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index 284f7ba64dbaa7..6e4a723b426fc6 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -28,6 +28,7 @@ namespace TargetStackID { enum Value { Default = 0, SGPRSpill = 1, + SVEVector = 2, NoAlloc = 255 }; } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 999c0ea30223e9..4ab61edec25fa6 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -953,12 +953,16 @@ class TargetLoweringBase { case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break; case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break; case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break; + case ISD::STRICT_LRINT: EqOpc = ISD::LRINT; break; + case ISD::STRICT_LLRINT: EqOpc = ISD::LLRINT; break; case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break; case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break; case ISD::STRICT_FMAXNUM: EqOpc = ISD::FMAXNUM; break; case ISD::STRICT_FMINNUM: EqOpc = ISD::FMINNUM; break; case ISD::STRICT_FCEIL: EqOpc = ISD::FCEIL; break; case ISD::STRICT_FFLOOR: EqOpc = ISD::FFLOOR; break; + case ISD::STRICT_LROUND: EqOpc = ISD::LROUND; break; + case ISD::STRICT_LLROUND: EqOpc = ISD::LLROUND; break; case ISD::STRICT_FROUND: EqOpc = ISD::FROUND; break; case ISD::STRICT_FTRUNC: EqOpc = ISD::FTRUNC; break; case ISD::STRICT_FP_TO_SINT: EqOpc = ISD::FP_TO_SINT; break; @@ -3263,6 +3267,8 @@ class TargetLowering : public TargetLoweringBase { SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true); SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true); + bool recursivelyDeleteUnusedNodes(SDNode *N); + void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO); }; diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h b/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h index 37293dfb8eda47..72687682f606c6 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/EHFrameSupport.h @@ -81,7 +81,7 @@ using StoreFrameRangeFunction = /// Authors of JITLinkContexts can use this function to register a post-fixup /// pass that records the range of the eh-frame section. This range can /// be used after finalization to register and deregister the frame. -AtomGraphPassFunction +LinkGraphPassFunction createEHFrameRecorderPass(const Triple &TT, StoreFrameRangeFunction StoreFrameRange); diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index be80d44ccf51cf..b531127cf89282 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -34,6 +34,9 @@ namespace llvm { namespace jitlink { +class Symbol; +class Section; + /// Base class for errors originating in JIT linker, e.g. missing relocation /// support. class JITLinkError : public ErrorInfo { @@ -50,27 +53,22 @@ class JITLinkError : public ErrorInfo { std::string ErrMsg; }; -// Forward declare the Atom class. -class Atom; - -/// Edge class. Represents both object file relocations, as well as layout and -/// keep-alive constraints. +/// Represents fixups and constraints in the LinkGraph. class Edge { public: using Kind = uint8_t; - using GenericEdgeKind = enum : Kind { + enum GenericEdgeKind : Kind { Invalid, // Invalid edge value. 
FirstKeepAlive, // Keeps target alive. Offset/addend zero. KeepAlive = FirstKeepAlive, // Tag first edge kind that preserves liveness. - LayoutNext, // Layout constraint. Offset/Addend zero. FirstRelocation // First architecture specific relocation. }; using OffsetT = uint32_t; using AddendT = int64_t; - Edge(Kind K, OffsetT Offset, Atom &Target, AddendT Addend) + Edge(Kind K, OffsetT Offset, Symbol &Target, AddendT Addend) : Target(&Target), Offset(Offset), Addend(Addend), K(K) {} OffsetT getOffset() const { return Offset; } @@ -82,461 +80,637 @@ class Edge { return K - FirstRelocation; } bool isKeepAlive() const { return K >= FirstKeepAlive; } - Atom &getTarget() const { return *Target; } - void setTarget(Atom &Target) { this->Target = &Target; } + Symbol &getTarget() const { return *Target; } + void setTarget(Symbol &Target) { this->Target = &Target; } AddendT getAddend() const { return Addend; } void setAddend(AddendT Addend) { this->Addend = Addend; } private: - Atom *Target; - OffsetT Offset; - AddendT Addend; + Symbol *Target = nullptr; + OffsetT Offset = 0; + AddendT Addend = 0; Kind K = 0; }; -using EdgeVector = std::vector; +/// Returns the string name of the given generic edge kind, or "unknown" +/// otherwise. Useful for debugging. +const char *getGenericEdgeKindName(Edge::Kind K); -const StringRef getGenericEdgeKindName(Edge::Kind K); - -/// Base Atom class. Used by absolute and undefined atoms. -class Atom { - friend class AtomGraph; +/// Base class for Addressable entities (externals, absolutes, blocks). +class Addressable { + friend class LinkGraph; protected: - /// Create a named (as yet unresolved) atom. - Atom(StringRef Name) - : Name(Name), IsDefined(false), IsLive(false), ShouldDiscard(false), - IsGlobal(false), IsAbsolute(false), IsCallable(false), - IsExported(false), IsWeak(false), HasLayoutNext(false), - IsCommon(false) {} - - /// Create an absolute symbol atom. - Atom(StringRef Name, JITTargetAddress Address) - : Name(Name), Address(Address), IsDefined(true), IsLive(false), - ShouldDiscard(false), IsGlobal(false), IsAbsolute(false), - IsCallable(false), IsExported(false), IsWeak(false), - HasLayoutNext(false), IsCommon(false) {} + Addressable(JITTargetAddress Address, bool IsDefined) + : Address(Address), IsDefined(IsDefined), IsAbsolute(false) {} -public: - /// Returns true if this atom has a name. - bool hasName() const { return Name != StringRef(); } + Addressable(JITTargetAddress Address) + : Address(Address), IsDefined(false), IsAbsolute(true) { + assert(!(IsDefined && IsAbsolute) && + "Block cannot be both defined and absolute"); + } - /// Returns the name of this atom. - StringRef getName() const { return Name; } +public: + Addressable(const Addressable &) = delete; + Addressable &operator=(const Addressable &) = default; + Addressable(Addressable &&) = delete; + Addressable &operator=(Addressable &&) = default; - /// Returns the current target address of this atom. - /// The initial target address (for atoms that have one) will be taken from - /// the input object file's virtual address space. During the layout phase - /// of JIT linking the atom's address will be updated to point to its final - /// address in the JIT'd process. JITTargetAddress getAddress() const { return Address; } - - /// Set the current target address of this atom. void setAddress(JITTargetAddress Address) { this->Address = Address; } - /// Returns true if this is a defined atom. 
- bool isDefined() const { return IsDefined; } + /// Returns true if this is a defined addressable, in which case you + /// can downcast this to a . + bool isDefined() const { return static_cast(IsDefined); } + bool isAbsolute() const { return static_cast(IsAbsolute); } - /// Returns true if this atom is marked as live. - bool isLive() const { return IsLive; } +private: + JITTargetAddress Address = 0; + uint64_t IsDefined : 1; + uint64_t IsAbsolute : 1; +}; - /// Mark this atom as live. - /// - /// Note: Only defined and absolute atoms can be marked live. - void setLive(bool IsLive) { - assert((IsDefined || IsAbsolute || !IsLive) && - "Only defined and absolute atoms can be marked live"); - this->IsLive = IsLive; - } +using BlockOrdinal = unsigned; +using SectionOrdinal = unsigned; - /// Returns true if this atom should be discarded during pruning. - bool shouldDiscard() const { return ShouldDiscard; } +/// An Addressable with content and edges. +class Block : public Addressable { + friend class LinkGraph; - /// Mark this atom to be discarded. - /// - /// Note: Only defined and absolute atoms can be marked live. - void setShouldDiscard(bool ShouldDiscard) { - assert((IsDefined || IsAbsolute || !ShouldDiscard) && - "Only defined and absolute atoms can be marked live"); - this->ShouldDiscard = ShouldDiscard; +private: + /// Create a zero-fill defined addressable. + Block(Section &Parent, BlockOrdinal Ordinal, JITTargetAddress Size, + JITTargetAddress Address, uint64_t Alignment, uint64_t AlignmentOffset) + : Addressable(Address, true), Parent(Parent), Size(Size), + Ordinal(Ordinal) { + assert(isPowerOf2_64(Alignment) && "Alignment must be power of 2"); + assert(AlignmentOffset < Alignment && + "Alignment offset cannot exceed alignment"); + assert(AlignmentOffset <= MaxAlignmentOffset && + "Alignment offset exceeds maximum"); + P2Align = Alignment ? countTrailingZeros(Alignment) : 0; + this->AlignmentOffset = AlignmentOffset; } - /// Returns true if this definition is global (i.e. visible outside this - /// linkage unit). - /// - /// Note: This is distict from Exported, which means visibile outside the - /// JITDylib that this graph is being linked in to. - bool isGlobal() const { return IsGlobal; } + /// Create a defined addressable for the given content. + Block(Section &Parent, BlockOrdinal Ordinal, StringRef Content, + JITTargetAddress Address, uint64_t Alignment, uint64_t AlignmentOffset) + : Addressable(Address, true), Parent(Parent), Data(Content.data()), + Size(Content.size()), Ordinal(Ordinal) { + assert(isPowerOf2_64(Alignment) && "Alignment must be power of 2"); + assert(AlignmentOffset < Alignment && + "Alignment offset cannot exceed alignment"); + assert(AlignmentOffset <= MaxAlignmentOffset && + "Alignment offset exceeds maximum"); + P2Align = Alignment ? countTrailingZeros(Alignment) : 0; + this->AlignmentOffset = AlignmentOffset; + } - /// Mark this atom as global. - void setGlobal(bool IsGlobal) { this->IsGlobal = IsGlobal; } +public: + using EdgeVector = std::vector; + using edge_iterator = EdgeVector::iterator; + using const_edge_iterator = EdgeVector::const_iterator; - /// Returns true if this atom represents an absolute symbol. - bool isAbsolute() const { return IsAbsolute; } + Block(const Block &) = delete; + Block &operator=(const Block &) = delete; + Block(Block &&) = delete; + Block &operator=(Block &&) = delete; - /// Returns true if this atom is known to be callable. + /// Return the parent section for this block. 
+ Section &getSection() const { return Parent; } + + /// Return the ordinal for this block. + BlockOrdinal getOrdinal() const { return Ordinal; } + + /// Returns true if this is a zero-fill block. /// - /// Primarily provided for easy interoperability with ORC, which uses the - /// JITSymbolFlags::Common flag to identify symbols that can be interposed - /// with stubs. - bool isCallable() const { return IsCallable; } + /// If true, getSize is callable but getContent is not (the content is + /// defined to be a sequence of zero bytes of length Size). + bool isZeroFill() const { return !Data; } + + /// Returns the size of this defined addressable. + size_t getSize() const { return Size; } + + /// Get the content for this block. Block must not be a zero-fill block. + StringRef getContent() const { + assert(Data && "Section does not contain content"); + return StringRef(Data, Size); + } - /// Mark this atom as callable. - void setCallable(bool IsCallable) { - assert((IsDefined || IsAbsolute || !IsCallable) && - "Callable atoms must be defined or absolute"); - this->IsCallable = IsCallable; + /// Set the content for this block. + /// Caller is responsible for ensuring the underlying bytes are not + /// deallocated while pointed to by this block. + void setContent(StringRef Content) { + Data = Content.data(); + Size = Content.size(); } - /// Returns true if this atom should appear in the symbol table of a final - /// linked image. - bool isExported() const { return IsExported; } + /// Get the alignment for this content. + uint64_t getAlignment() const { return 1ull << P2Align; } + + /// Get the alignment offset for this content. + uint64_t getAlignmentOffset() const { return AlignmentOffset; } - /// Mark this atom as exported. - void setExported(bool IsExported) { - assert((!IsExported || ((IsDefined || IsAbsolute) && hasName())) && - "Exported atoms must have names"); - this->IsExported = IsExported; + /// Add an edge to this block. + void addEdge(Edge::Kind K, Edge::OffsetT Offset, Symbol &Target, + Edge::AddendT Addend) { + Edges.push_back(Edge(K, Offset, Target, Addend)); } - /// Returns true if this is a weak symbol. - bool isWeak() const { return IsWeak; } + /// Return the list of edges attached to this content. + iterator_range edges() { + return make_range(Edges.begin(), Edges.end()); + } - /// Mark this atom as weak. - void setWeak(bool IsWeak) { this->IsWeak = IsWeak; } + /// Returns the list of edges attached to this content. + iterator_range edges() const { + return make_range(Edges.begin(), Edges.end()); + } -private: - StringRef Name; - JITTargetAddress Address = 0; + /// Return the size of the edges list. + size_t edges_size() const { return Edges.size(); } - bool IsDefined : 1; - bool IsLive : 1; - bool ShouldDiscard : 1; + /// Returns true if the list of edges is empty. + bool edges_empty() const { return Edges.empty(); } - bool IsGlobal : 1; - bool IsAbsolute : 1; - bool IsCallable : 1; - bool IsExported : 1; - bool IsWeak : 1; +private: + static constexpr uint64_t MaxAlignmentOffset = (1ULL << 57) - 1; -protected: - // These flags only make sense for DefinedAtom, but we can minimize the size - // of DefinedAtom by defining them here. - bool HasLayoutNext : 1; - bool IsCommon : 1; + uint64_t P2Align : 5; + uint64_t AlignmentOffset : 57; + Section &Parent; + const char *Data = nullptr; + size_t Size = 0; + BlockOrdinal Ordinal = 0; + std::vector Edges; }; -// Forward declare DefinedAtom. -class DefinedAtom; +/// Describes symbol linkage. 
This can be used to make resolve definition +/// clashes. +enum class Linkage : uint8_t { + Strong, + Weak, +}; -raw_ostream &operator<<(raw_ostream &OS, const Atom &A); -void printEdge(raw_ostream &OS, const Atom &FixupAtom, const Edge &E, - StringRef EdgeKindName); +/// For errors and debugging output. +const char *getLinkageName(Linkage L); + +/// Defines the scope in which this symbol should be visible: +/// Default -- Visible in the public interface of the linkage unit. +/// Hidden -- Visible within the linkage unit, but not exported from it. +/// Local -- Visible only within the LinkGraph. +enum class Scope : uint8_t { Default, Hidden, Local }; + +/// For debugging output. +const char *getScopeName(Scope S); + +raw_ostream &operator<<(raw_ostream &OS, const Block &B); + +/// Symbol representation. +/// +/// Symbols represent locations within Addressable objects. +/// They can be either Named or Anonymous. +/// Anonymous symbols have neither linkage nor visibility, and must point at +/// ContentBlocks. +/// Named symbols may be in one of four states: +/// - Null: Default initialized. Assignable, but otherwise unusable. +/// - Defined: Has both linkage and visibility and points to a ContentBlock +/// - Common: Has both linkage and visibility, points to a null Addressable. +/// - External: Has neither linkage nor visibility, points to an external +/// Addressable. +/// +class Symbol { + friend class LinkGraph; + +private: + Symbol(Addressable &Base, JITTargetAddress Offset, StringRef Name, + JITTargetAddress Size, Linkage L, Scope S, bool IsLive, + bool IsCallable) + : Name(Name), Base(&Base), Offset(Offset), Size(Size) { + setLinkage(L); + setScope(S); + setLive(IsLive); + setCallable(IsCallable); + } + + static Symbol &constructCommon(void *SymStorage, Block &Base, StringRef Name, + JITTargetAddress Size, Scope S, bool IsLive) { + assert(SymStorage && "Storage cannot be null"); + assert(!Name.empty() && "Common symbol name cannot be empty"); + assert(Base.isDefined() && + "Cannot create common symbol from undefined block"); + assert(static_cast(Base).getSize() == Size && + "Common symbol size should match underlying block size"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, 0, Name, Size, Linkage::Weak, S, IsLive, false); + return *Sym; + } + + static Symbol &constructExternal(void *SymStorage, Addressable &Base, + StringRef Name, JITTargetAddress Size) { + assert(SymStorage && "Storage cannot be null"); + assert(!Base.isDefined() && + "Cannot create external symbol from defined block"); + assert(!Name.empty() && "External symbol name cannot be empty"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, 0, Name, Size, Linkage::Strong, Scope::Default, + false, false); + return *Sym; + } + + static Symbol &constructAbsolute(void *SymStorage, Addressable &Base, + StringRef Name, JITTargetAddress Size, + Linkage L, Scope S, bool IsLive) { + assert(SymStorage && "Storage cannot be null"); + assert(!Base.isDefined() && + "Cannot create absolute symbol from a defined block"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, 0, Name, Size, L, S, IsLive, false); + return *Sym; + } + + static Symbol &constructAnonDef(void *SymStorage, Block &Base, + JITTargetAddress Offset, + JITTargetAddress Size, bool IsCallable, + bool IsLive) { + assert(SymStorage && "Storage cannot be null"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, Offset, StringRef(), Size, Linkage::Strong, + Scope::Local, IsLive, IsCallable); + 
return *Sym; + } + + static Symbol &constructNamedDef(void *SymStorage, Block &Base, + JITTargetAddress Offset, StringRef Name, + JITTargetAddress Size, Linkage L, Scope S, + bool IsLive, bool IsCallable) { + assert(SymStorage && "Storage cannot be null"); + assert(!Name.empty() && "Name cannot be empty"); + auto *Sym = reinterpret_cast(SymStorage); + new (Sym) Symbol(Base, Offset, Name, Size, L, S, IsLive, IsCallable); + return *Sym; + } -/// Represents a section address range via a pair of DefinedAtom pointers to -/// the first and last atoms in the section. -class SectionRange { public: - SectionRange() = default; - SectionRange(DefinedAtom *First, DefinedAtom *Last) - : First(First), Last(Last) {} - DefinedAtom *getFirstAtom() const { - assert((!Last || First) && "First can not be null if end is non-null"); - return First; + /// Create a null Symbol. This allows Symbols to be default initialized for + /// use in containers (e.g. as map values). Null symbols are only useful for + /// assigning to. + Symbol() = default; + + // Symbols are not movable or copyable. + Symbol(const Symbol &) = delete; + Symbol &operator=(const Symbol &) = delete; + Symbol(Symbol &&) = delete; + Symbol &operator=(Symbol &&) = delete; + + /// Returns true if this symbol has a name. + bool hasName() const { return !Name.empty(); } + + /// Returns the name of this symbol (empty if the symbol is anonymous). + StringRef getName() const { + assert((!Name.empty() || getScope() == Scope::Local) && + "Anonymous symbol has non-local scope"); + return Name; } - DefinedAtom *getLastAtom() const { - assert((First || !Last) && "Last can not be null if start is non-null"); - return Last; + + /// Returns true if this Symbol has content (potentially) defined within this + /// object file (i.e. is anything but an external or absolute symbol). + bool isDefined() const { + assert(Base && "Attempt to access null symbol"); + return Base->isDefined(); } - bool isEmpty() const { - assert((First || !Last) && "Last can not be null if start is non-null"); - return !First; + + /// Returns true if this symbol is live (i.e. should be treated as a root for + /// dead stripping). + bool isLive() const { + assert(Base && "Attempting to access null symbol"); + return IsLive; } - JITTargetAddress getStart() const; - JITTargetAddress getEnd() const; - uint64_t getSize() const; -private: - DefinedAtom *First = nullptr; - DefinedAtom *Last = nullptr; -}; + /// Set this symbol's live bit. + void setLive(bool IsLive) { this->IsLive = IsLive; } -/// Represents an object file section. -class Section { - friend class AtomGraph; + /// Returns true is this symbol is callable. + bool isCallable() const { return IsCallable; } -private: - Section(StringRef Name, uint32_t Alignment, sys::Memory::ProtectionFlags Prot, - unsigned Ordinal, bool IsZeroFill) - : Name(Name), Alignment(Alignment), Prot(Prot), Ordinal(Ordinal), - IsZeroFill(IsZeroFill) { - assert(isPowerOf2_32(Alignment) && "Alignments must be a power of 2"); + /// Set this symbol's callable bit. + void setCallable(bool IsCallable) { this->IsCallable = IsCallable; } + + /// Returns true if the underlying addressable is an unresolved external. + bool isExternal() const { + assert(Base && "Attempt to access null symbol"); + return !Base->isDefined() && !Base->isAbsolute(); } - using DefinedAtomSet = DenseSet; + /// Returns true if the underlying addressable is an absolute symbol. 
+ bool isAbsolute() const { + assert(Base && "Attempt to access null symbol"); + return !Base->isDefined() && Base->isAbsolute(); + } -public: - using atom_iterator = DefinedAtomSet::iterator; - using const_atom_iterator = DefinedAtomSet::const_iterator; + /// Return the addressable that this symbol points to. + Addressable &getAddressable() { + assert(Base && "Cannot get underlying addressable for null symbol"); + return *Base; + } - ~Section(); - StringRef getName() const { return Name; } - uint32_t getAlignment() const { return Alignment; } - sys::Memory::ProtectionFlags getProtectionFlags() const { return Prot; } - unsigned getSectionOrdinal() const { return Ordinal; } - size_t getNextAtomOrdinal() { return ++NextAtomOrdinal; } + /// Return the addressable that thsi symbol points to. + const Addressable &getAddressable() const { + assert(Base && "Cannot get underlying addressable for null symbol"); + return *Base; + } - bool isZeroFill() const { return IsZeroFill; } + /// Return the Block for this Symbol (Symbol must be defined). + Block &getBlock() { + assert(Base && "Cannot get block for null symbol"); + assert(Base->isDefined() && "Not a defined symbol"); + return static_cast(*Base); + } - /// Returns an iterator over the atoms in the section (in no particular - /// order). - iterator_range atoms() { - return make_range(DefinedAtoms.begin(), DefinedAtoms.end()); + /// Return the Block for this Symbol (Symbol must be defined). + const Block &getBlock() const { + assert(Base && "Cannot get block for null symbol"); + assert(Base->isDefined() && "Not a defined symbol"); + return static_cast(*Base); } - /// Returns an iterator over the atoms in the section (in no particular - /// order). - iterator_range atoms() const { - return make_range(DefinedAtoms.begin(), DefinedAtoms.end()); + /// Returns the offset for this symbol within the underlying addressable. + JITTargetAddress getOffset() const { return Offset; } + + /// Returns the address of this symbol. + JITTargetAddress getAddress() const { return Base->getAddress() + Offset; } + + /// Returns the size of this symbol. + JITTargetAddress getSize() const { return Size; } + + /// Returns true if this symbol is backed by a zero-fill block. + /// This method may only be called on defined symbols. + bool isSymbolZeroFill() const { return getBlock().isZeroFill(); } + + /// Returns the content in the underlying block covered by this symbol. + /// This method may only be called on defined non-zero-fill symbols. + StringRef getSymbolContent() const { + return getBlock().getContent().substr(Offset, Size); } - /// Return the number of atoms in this section. - DefinedAtomSet::size_type atoms_size() { return DefinedAtoms.size(); } + /// Get the linkage for this Symbol. + Linkage getLinkage() const { return static_cast(L); } - /// Return true if this section contains no atoms. - bool atoms_empty() const { return DefinedAtoms.empty(); } + /// Set the linkage for this Symbol. + void setLinkage(Linkage L) { + assert((L == Linkage::Strong || (Base->isDefined() && !Name.empty())) && + "Linkage can only be applied to defined named symbols"); + this->L = static_cast(L); + } - /// Returns the range of this section as the pair of atoms with the lowest - /// and highest target address. This operation is expensive, as it - /// must traverse all atoms in the section. - /// - /// Note: If the section is empty, both values will be null. The section - /// address will evaluate to null, and the size to zero. 
If the section - /// contains a single atom both values will point to it, the address will - /// evaluate to the address of that atom, and the size will be the size of - /// that atom. - SectionRange getRange() const; + /// Get the visibility for this Symbol. + Scope getScope() const { return static_cast(S); } -private: - void addAtom(DefinedAtom &DA) { - assert(!DefinedAtoms.count(&DA) && "Atom is already in this section"); - DefinedAtoms.insert(&DA); + /// Set the visibility for this Symbol. + void setScope(Scope S) { + assert((S == Scope::Default || Base->isDefined() || Base->isAbsolute()) && + "Invalid visibility for symbol type"); + this->S = static_cast(S); } - void removeAtom(DefinedAtom &DA) { - assert(DefinedAtoms.count(&DA) && "Atom is not in this section"); - DefinedAtoms.erase(&DA); +private: + void makeExternal(Addressable &A) { + assert(!A.isDefined() && "Attempting to make external with defined block"); + Base = &A; + Offset = 0; + setLinkage(Linkage::Strong); + setScope(Scope::Default); + IsLive = 0; + // note: Size and IsCallable fields left unchanged. } + static constexpr uint64_t MaxOffset = (1ULL << 59) - 1; + + // FIXME: A char* or SymbolStringPtr may pack better. StringRef Name; - uint32_t Alignment = 0; - sys::Memory::ProtectionFlags Prot; - unsigned Ordinal = 0; - unsigned NextAtomOrdinal = 0; - bool IsZeroFill = false; - DefinedAtomSet DefinedAtoms; + Addressable *Base = nullptr; + uint64_t Offset : 59; + uint64_t L : 1; + uint64_t S : 2; + uint64_t IsLive : 1; + uint64_t IsCallable : 1; + JITTargetAddress Size = 0; }; -/// Defined atom class. Suitable for use by defined named and anonymous -/// atoms. -class DefinedAtom : public Atom { - friend class AtomGraph; +raw_ostream &operator<<(raw_ostream &OS, const Symbol &A); + +void printEdge(raw_ostream &OS, const Block &B, const Edge &E, + StringRef EdgeKindName); + +/// Represents an object file section. +class Section { + friend class LinkGraph; private: - DefinedAtom(Section &Parent, JITTargetAddress Address, uint32_t Alignment) - : Atom("", Address), Parent(Parent), Ordinal(Parent.getNextAtomOrdinal()), - Alignment(Alignment) { - assert(isPowerOf2_32(Alignment) && "Alignments must be a power of two"); - } + Section(StringRef Name, sys::Memory::ProtectionFlags Prot, + SectionOrdinal SecOrdinal) + : Name(Name), Prot(Prot), SecOrdinal(SecOrdinal) {} - DefinedAtom(Section &Parent, StringRef Name, JITTargetAddress Address, - uint32_t Alignment) - : Atom(Name, Address), Parent(Parent), - Ordinal(Parent.getNextAtomOrdinal()), Alignment(Alignment) { - assert(isPowerOf2_32(Alignment) && "Alignments must be a power of two"); - } + using SymbolSet = DenseSet; + using BlockSet = DenseSet; public: - using edge_iterator = EdgeVector::iterator; + using symbol_iterator = SymbolSet::iterator; + using const_symbol_iterator = SymbolSet::const_iterator; - Section &getSection() const { return Parent; } + using block_iterator = BlockSet::iterator; + using const_block_iterator = BlockSet::const_iterator; - uint64_t getSize() const { return Size; } + ~Section(); - StringRef getContent() const { - assert(!Parent.isZeroFill() && "Trying to get content for zero-fill atom"); - assert(Size <= std::numeric_limits::max() && - "Content size too large"); - return {ContentPtr, static_cast(Size)}; - } - void setContent(StringRef Content) { - assert(!Parent.isZeroFill() && "Calling setContent on zero-fill atom?"); - ContentPtr = Content.data(); - Size = Content.size(); - } + /// Returns the name of this section. 
+ StringRef getName() const { return Name; } + + /// Returns the protection flags for this section. + sys::Memory::ProtectionFlags getProtectionFlags() const { return Prot; } - bool isZeroFill() const { return Parent.isZeroFill(); } + /// Returns the ordinal for this section. + SectionOrdinal getOrdinal() const { return SecOrdinal; } - void setZeroFill(uint64_t Size) { - assert(Parent.isZeroFill() && !ContentPtr && - "Can't set zero-fill length of a non zero-fill atom"); - this->Size = Size; + /// Returns an iterator over the symbols defined in this section. + iterator_range symbols() { + return make_range(Symbols.begin(), Symbols.end()); } - uint64_t getZeroFillSize() const { - assert(Parent.isZeroFill() && - "Can't get zero-fill length of a non zero-fill atom"); - return Size; + /// Returns an iterator over the symbols defined in this section. + iterator_range symbols() const { + return make_range(Symbols.begin(), Symbols.end()); } - uint32_t getAlignment() const { return Alignment; } + /// Return the number of symbols in this section. + SymbolSet::size_type symbols_size() { return Symbols.size(); } - bool hasLayoutNext() const { return HasLayoutNext; } - void setLayoutNext(DefinedAtom &Next) { - assert(!HasLayoutNext && "Atom already has layout-next constraint"); - HasLayoutNext = true; - Edges.push_back(Edge(Edge::LayoutNext, 0, Next, 0)); - } - DefinedAtom &getLayoutNext() { - assert(HasLayoutNext && "Atom does not have a layout-next constraint"); - DefinedAtom *Next = nullptr; - for (auto &E : edges()) - if (E.getKind() == Edge::LayoutNext) { - assert(E.getTarget().isDefined() && - "layout-next target atom must be a defined atom"); - Next = static_cast(&E.getTarget()); - break; - } - assert(Next && "Missing LayoutNext edge"); - return *Next; - } + /// Return true if this section contains no symbols. + bool symbols_empty() const { return Symbols.empty(); } - bool isCommon() const { return IsCommon; } + /// Returns the ordinal for the next block. + BlockOrdinal getNextBlockOrdinal() { return NextBlockOrdinal++; } - void addEdge(Edge::Kind K, Edge::OffsetT Offset, Atom &Target, - Edge::AddendT Addend) { - assert(K != Edge::LayoutNext && - "Layout edges should be added via setLayoutNext"); - Edges.push_back(Edge(K, Offset, Target, Addend)); +private: + void addSymbol(Symbol &Sym) { + assert(!Symbols.count(&Sym) && "Symbol is already in this section"); + Symbols.insert(&Sym); } - iterator_range edges() { - return make_range(Edges.begin(), Edges.end()); + void removeSymbol(Symbol &Sym) { + assert(Symbols.count(&Sym) && "symbol is not in this section"); + Symbols.erase(&Sym); } - size_t edges_size() const { return Edges.size(); } - bool edges_empty() const { return Edges.empty(); } - unsigned getOrdinal() const { return Ordinal; } + StringRef Name; + sys::Memory::ProtectionFlags Prot; + SectionOrdinal SecOrdinal = 0; + BlockOrdinal NextBlockOrdinal = 0; + SymbolSet Symbols; +}; -private: - void setCommon(uint64_t Size) { - assert(ContentPtr == 0 && "Atom already has content?"); - IsCommon = true; - setZeroFill(Size); +/// Represents a section address range via a pair of Block pointers +/// to the first and last Blocks in the section. 
+class SectionRange { +public: + SectionRange() = default; + SectionRange(const Section &Sec) { + if (Sec.symbols_empty()) + return; + First = Last = *Sec.symbols().begin(); + for (auto *Sym : Sec.symbols()) { + if (Sym->getAddress() < First->getAddress()) + First = Sym; + if (Sym->getAddress() > Last->getAddress()) + Last = Sym; + } + } + Symbol *getFirstSymbol() const { + assert((!Last || First) && "First can not be null if end is non-null"); + return First; + } + Symbol *getLastSymbol() const { + assert((First || !Last) && "Last can not be null if start is non-null"); + return Last; + } + bool isEmpty() const { + assert((First || !Last) && "Last can not be null if start is non-null"); + return !First; + } + JITTargetAddress getStart() const { + return First ? First->getBlock().getAddress() : 0; + } + JITTargetAddress getEnd() const { + return Last ? Last->getBlock().getAddress() + Last->getBlock().getSize() + : 0; } + uint64_t getSize() const { return getEnd() - getStart(); } - EdgeVector Edges; - uint64_t Size = 0; - Section &Parent; - const char *ContentPtr = nullptr; - unsigned Ordinal = 0; - uint32_t Alignment = 0; +private: + Symbol *First = nullptr; + Symbol *Last = nullptr; }; -inline JITTargetAddress SectionRange::getStart() const { - return First ? First->getAddress() : 0; -} +class LinkGraph { +private: + using SectionList = std::vector>; + using ExternalSymbolSet = DenseSet; + using BlockSet = DenseSet; + + template + Addressable &createAddressable(ArgTs &&... Args) { + Addressable *A = + reinterpret_cast(Allocator.Allocate()); + new (A) Addressable(std::forward(Args)...); + return *A; + } -inline JITTargetAddress SectionRange::getEnd() const { - return Last ? Last->getAddress() + Last->getSize() : 0; -} + void destroyAddressable(Addressable &A) { + A.~Addressable(); + Allocator.Deallocate(&A); + } -inline uint64_t SectionRange::getSize() const { return getEnd() - getStart(); } + template Block &createBlock(ArgTs &&... Args) { + Block *B = reinterpret_cast(Allocator.Allocate()); + new (B) Block(std::forward(Args)...); + Blocks.insert(B); + return *B; + } -inline SectionRange Section::getRange() const { - if (atoms_empty()) - return SectionRange(); - DefinedAtom *First = *DefinedAtoms.begin(), *Last = *DefinedAtoms.begin(); - for (auto *DA : atoms()) { - if (DA->getAddress() < First->getAddress()) - First = DA; - if (DA->getAddress() > Last->getAddress()) - Last = DA; + void destroyBlock(Block &B) { + Blocks.erase(&B); + B.~Block(); + Allocator.Deallocate(&B); } - return SectionRange(First, Last); -} -class AtomGraph { -private: - using SectionList = std::vector>; - using AddressToAtomMap = std::map; - using NamedAtomMap = DenseMap; - using ExternalAtomSet = DenseSet; + void destroySymbol(Symbol &S) { + S.~Symbol(); + Allocator.Deallocate(&S); + } public: - using external_atom_iterator = ExternalAtomSet::iterator; + using external_symbol_iterator = ExternalSymbolSet::iterator; + + using block_iterator = BlockSet::iterator; using section_iterator = pointee_iterator; using const_section_iterator = pointee_iterator; - template - class defined_atom_iterator_impl + template + class defined_symbol_iterator_impl : public iterator_facade_base< - defined_atom_iterator_impl, + defined_symbol_iterator_impl, std::forward_iterator_tag, T> { public: - defined_atom_iterator_impl() = default; + defined_symbol_iterator_impl() = default; - defined_atom_iterator_impl(SecItrT SI, SecItrT SE) - : SI(SI), SE(SE), - AI(SI != SE ? 
SI->atoms().begin() : Section::atom_iterator()) { - moveToNextAtomOrEnd(); + defined_symbol_iterator_impl(SectionItrT SecI, SectionItrT SecE) + : SecI(SecI), SecE(SecE), + SymI(SecI != SecE ? SecI->symbols().begin() : SymbolItrT()) { + moveToNextSymbolOrEnd(); } - bool operator==(const defined_atom_iterator_impl &RHS) const { - return (SI == RHS.SI) && (AI == RHS.AI); + bool operator==(const defined_symbol_iterator_impl &RHS) const { + return (SecI == RHS.SecI) && (SymI == RHS.SymI); } T operator*() const { - assert(AI != SI->atoms().end() && "Dereferencing end?"); - return *AI; + assert(SymI != SecI->symbols().end() && "Dereferencing end?"); + return *SymI; } - defined_atom_iterator_impl operator++() { - ++AI; - moveToNextAtomOrEnd(); + defined_symbol_iterator_impl operator++() { + ++SymI; + moveToNextSymbolOrEnd(); return *this; } private: - void moveToNextAtomOrEnd() { - while (SI != SE && AI == SI->atoms().end()) { - ++SI; - if (SI == SE) - AI = Section::atom_iterator(); - else - AI = SI->atoms().begin(); + void moveToNextSymbolOrEnd() { + while (SecI != SecE && SymI == SecI->symbols().end()) { + ++SecI; + SymI = SecI == SecE ? SymbolItrT() : SecI->symbols().begin(); } } - SecItrT SI, SE; - AtomItrT AI; + SectionItrT SecI, SecE; + SymbolItrT SymI; }; - using defined_atom_iterator = - defined_atom_iterator_impl; + using defined_symbol_iterator = + defined_symbol_iterator_impl; - using const_defined_atom_iterator = - defined_atom_iterator_impl; + using const_defined_symbol_iterator = defined_symbol_iterator_impl< + const_section_iterator, Section::const_symbol_iterator, const Symbol *>; - AtomGraph(std::string Name, unsigned PointerSize, + LinkGraph(std::string Name, unsigned PointerSize, support::endianness Endianness) : Name(std::move(Name)), PointerSize(PointerSize), Endianness(Endianness) {} + ~LinkGraph(); + /// Returns the name of this graph (usually the name of the original /// underlying MemoryBuffer). const std::string &getName() { return Name; } @@ -544,84 +718,83 @@ class AtomGraph { /// Returns the pointer size for use in this graph. unsigned getPointerSize() const { return PointerSize; } - /// Returns the endianness of atom-content in this graph. + /// Returns the endianness of content in this graph. support::endianness getEndianness() const { return Endianness; } /// Create a section with the given name, protection flags, and alignment. - Section &createSection(StringRef Name, uint32_t Alignment, - sys::Memory::ProtectionFlags Prot, bool IsZeroFill) { - std::unique_ptr
Sec( - new Section(Name, Alignment, Prot, Sections.size(), IsZeroFill)); + Section &createSection(StringRef Name, sys::Memory::ProtectionFlags Prot) { + std::unique_ptr<Section>
Sec(new Section(Name, Prot, Sections.size())); Sections.push_back(std::move(Sec)); return *Sections.back(); } - /// Add an external atom representing an undefined symbol in this graph. - Atom &addExternalAtom(StringRef Name) { - assert(!NamedAtoms.count(Name) && "Duplicate named atom inserted"); - Atom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(Atom), alignof(Atom))); - new (A) Atom(Name); - ExternalAtoms.insert(A); - NamedAtoms[Name] = A; - return *A; + /// Create a content block. + Block &createContentBlock(Section &Parent, StringRef Content, + uint64_t Address, uint64_t Alignment, + uint64_t AlignmentOffset) { + return createBlock(Parent, Parent.getNextBlockOrdinal(), Content, Address, + Alignment, AlignmentOffset); } - /// Add an external atom representing an absolute symbol. - Atom &addAbsoluteAtom(StringRef Name, JITTargetAddress Addr) { - assert(!NamedAtoms.count(Name) && "Duplicate named atom inserted"); - Atom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(Atom), alignof(Atom))); - new (A) Atom(Name, Addr); - AbsoluteAtoms.insert(A); - NamedAtoms[Name] = A; - return *A; + /// Create a zero-fill block. + Block &createZeroFillBlock(Section &Parent, uint64_t Size, uint64_t Address, + uint64_t Alignment, uint64_t AlignmentOffset) { + return createBlock(Parent, Parent.getNextBlockOrdinal(), Size, Address, + Alignment, AlignmentOffset); } - /// Add an anonymous defined atom to the graph. - /// - /// Anonymous atoms have content but no name. They must have an address. - DefinedAtom &addAnonymousAtom(Section &Parent, JITTargetAddress Address, - uint32_t Alignment) { - DefinedAtom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(DefinedAtom), alignof(DefinedAtom))); - new (A) DefinedAtom(Parent, Address, Alignment); - Parent.addAtom(*A); - getAddrToAtomMap()[A->getAddress()] = A; - return *A; + /// Add an external symbol. + /// Some formats (e.g. ELF) allow Symbols to have sizes. For Symbols whose + /// size is not known, you should substitute '0'. + Symbol &addExternalSymbol(StringRef Name, uint64_t Size) { + auto &Sym = Symbol::constructExternal( + Allocator.Allocate(), createAddressable(0, false), Name, Size); + ExternalSymbols.insert(&Sym); + return Sym; } - /// Add a defined atom to the graph. - /// - /// Allocates and constructs a DefinedAtom instance with the given parent, - /// name, address, and alignment. - DefinedAtom &addDefinedAtom(Section &Parent, StringRef Name, - JITTargetAddress Address, uint32_t Alignment) { - assert(!NamedAtoms.count(Name) && "Duplicate named atom inserted"); - DefinedAtom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(DefinedAtom), alignof(DefinedAtom))); - new (A) DefinedAtom(Parent, Name, Address, Alignment); - Parent.addAtom(*A); - getAddrToAtomMap()[A->getAddress()] = A; - NamedAtoms[Name] = A; - return *A; + /// Add an absolute symbol. + Symbol &addAbsoluteSymbol(StringRef Name, JITTargetAddress Address, + uint64_t Size, Linkage L, Scope S, bool IsLive) { + auto &Sym = Symbol::constructAbsolute(Allocator.Allocate(), + createAddressable(Address), Name, + Size, L, S, IsLive); + AbsoluteSymbols.insert(&Sym); + return Sym; } - /// Add a common symbol atom to the graph. - /// - /// Adds a common-symbol atom to the graph with the given parent, name, - /// address, alignment and size. 
- DefinedAtom &addCommonAtom(Section &Parent, StringRef Name, - JITTargetAddress Address, uint32_t Alignment, - uint64_t Size) { - assert(!NamedAtoms.count(Name) && "Duplicate named atom inserted"); - DefinedAtom *A = reinterpret_cast( - AtomAllocator.Allocate(sizeof(DefinedAtom), alignof(DefinedAtom))); - new (A) DefinedAtom(Parent, Name, Address, Alignment); - A->setCommon(Size); - Parent.addAtom(*A); - NamedAtoms[Name] = A; - return *A; + /// Convenience method for adding a weak zero-fill symbol. + Symbol &addCommonSymbol(StringRef Name, Scope S, Section &Section, + JITTargetAddress Address, uint64_t Size, + uint64_t Alignment, bool IsLive) { + auto &Sym = Symbol::constructCommon( + Allocator.Allocate(), + createBlock(Section, Section.getNextBlockOrdinal(), Address, Size, + Alignment, 0), + Name, Size, S, IsLive); + Section.addSymbol(Sym); + return Sym; + } + + /// Add an anonymous symbol. + Symbol &addAnonymousSymbol(Block &Content, JITTargetAddress Offset, + JITTargetAddress Size, bool IsCallable, + bool IsLive) { + auto &Sym = Symbol::constructAnonDef(Allocator.Allocate(), Content, + Offset, Size, IsCallable, IsLive); + Content.getSection().addSymbol(Sym); + return Sym; + } + + /// Add a named symbol. + Symbol &addDefinedSymbol(Block &Content, JITTargetAddress Offset, + StringRef Name, JITTargetAddress Size, Linkage L, + Scope S, bool IsCallable, bool IsLive) { + auto &Sym = + Symbol::constructNamedDef(Allocator.Allocate(), Content, Offset, + Name, Size, L, S, IsLive, IsCallable); + Content.getSection().addSymbol(Sym); + return Sym; } iterator_range sections() { @@ -638,135 +811,79 @@ class AtomGraph { return nullptr; } - iterator_range external_atoms() { - return make_range(ExternalAtoms.begin(), ExternalAtoms.end()); + iterator_range external_symbols() { + return make_range(ExternalSymbols.begin(), ExternalSymbols.end()); } - iterator_range absolute_atoms() { - return make_range(AbsoluteAtoms.begin(), AbsoluteAtoms.end()); + iterator_range absolute_symbols() { + return make_range(AbsoluteSymbols.begin(), AbsoluteSymbols.end()); } - iterator_range defined_atoms() { - return make_range(defined_atom_iterator(Sections.begin(), Sections.end()), - defined_atom_iterator(Sections.end(), Sections.end())); + iterator_range defined_symbols() { + return make_range(defined_symbol_iterator(Sections.begin(), Sections.end()), + defined_symbol_iterator(Sections.end(), Sections.end())); } - iterator_range defined_atoms() const { + iterator_range defined_symbols() const { return make_range( - const_defined_atom_iterator(Sections.begin(), Sections.end()), - const_defined_atom_iterator(Sections.end(), Sections.end())); - } - - /// Returns the atom with the given name, which must exist in this graph. - Atom &getAtomByName(StringRef Name) { - auto I = NamedAtoms.find(Name); - assert(I != NamedAtoms.end() && "Name not in NamedAtoms map"); - return *I->second; - } - - /// Returns the atom with the given name, which must exist in this graph and - /// be a DefinedAtom. - DefinedAtom &getDefinedAtomByName(StringRef Name) { - auto &A = getAtomByName(Name); - assert(A.isDefined() && "Atom is not a defined atom"); - return static_cast(A); - } - - /// Search for the given atom by name. - /// Returns the atom (if found) or an error (if no atom with this name - /// exists). - Expected findAtomByName(StringRef Name) { - auto I = NamedAtoms.find(Name); - if (I == NamedAtoms.end()) - return make_error("No atom named " + Name); - return *I->second; - } - - /// Search for the given defined atom by name. 
- /// Returns the defined atom (if found) or an error (if no atom with this - /// name exists, or if one exists but is not a defined atom). - Expected findDefinedAtomByName(StringRef Name) { - auto I = NamedAtoms.find(Name); - if (I == NamedAtoms.end()) - return make_error("No atom named " + Name); - if (!I->second->isDefined()) - return make_error("Atom " + Name + - " exists but is not a " - "defined atom"); - return static_cast(*I->second); - } - - /// Returns the atom covering the given address, or an error if no such atom - /// exists. - /// - /// Returns null if no atom exists at the given address. - DefinedAtom *getAtomByAddress(JITTargetAddress Address) { - refreshAddrToAtomCache(); - - // If there are no defined atoms, bail out early. - if (AddrToAtomCache->empty()) - return nullptr; - - // Find the atom *after* the given address. - auto I = AddrToAtomCache->upper_bound(Address); - - // If this address falls before any known atom, bail out. - if (I == AddrToAtomCache->begin()) - return nullptr; - - // The atom we're looking for is the one before the atom we found. - --I; - - // Otherwise range check the atom that was found. - assert(!I->second->getContent().empty() && "Atom content not set"); - if (Address >= I->second->getAddress() + I->second->getContent().size()) - return nullptr; + const_defined_symbol_iterator(Sections.begin(), Sections.end()), + const_defined_symbol_iterator(Sections.end(), Sections.end())); + } - return I->second; + iterator_range blocks() { + return make_range(Blocks.begin(), Blocks.end()); } - /// Like getAtomByAddress, but returns an Error if the given address is not - /// covered by an atom, rather than a null pointer. - Expected findAtomByAddress(JITTargetAddress Address) { - if (auto *DA = getAtomByAddress(Address)) - return *DA; - return make_error("No atom at address " + - formatv("{0:x16}", Address)); + /// Turn a defined symbol into an external one. + void makeExternal(Symbol &Sym) { + if (Sym.getAddressable().isAbsolute()) { + assert(AbsoluteSymbols.count(&Sym) && + "Sym is not in the absolute symbols set"); + AbsoluteSymbols.erase(&Sym); + } else { + assert(Sym.isDefined() && "Sym is not a defined symbol"); + Section &Sec = Sym.getBlock().getSection(); + Sec.removeSymbol(Sym); + } + Sym.makeExternal(createAddressable(false)); + ExternalSymbols.insert(&Sym); } - // Remove the given external atom from the graph. - void removeExternalAtom(Atom &A) { - assert(!A.isDefined() && !A.isAbsolute() && "A is not an external atom"); - assert(ExternalAtoms.count(&A) && "A is not in the external atoms set"); - ExternalAtoms.erase(&A); - A.~Atom(); + /// Removes an external symbol. Also removes the underlying Addressable. + void removeExternalSymbol(Symbol &Sym) { + assert(!Sym.isDefined() && !Sym.isAbsolute() && + "Sym is not an external symbol"); + assert(ExternalSymbols.count(&Sym) && "Symbol is not in the externals set"); + ExternalSymbols.erase(&Sym); + Addressable &Base = *Sym.Base; + destroySymbol(Sym); + destroyAddressable(Base); } - /// Remove the given absolute atom from the graph. - void removeAbsoluteAtom(Atom &A) { - assert(A.isAbsolute() && "A is not an absolute atom"); - assert(AbsoluteAtoms.count(&A) && "A is not in the absolute atoms set"); - AbsoluteAtoms.erase(&A); - A.~Atom(); + /// Remove an absolute symbol. Also removes the underlying Addressable. 
+ void removeAbsoluteSymbol(Symbol &Sym) { + assert(!Sym.isDefined() && Sym.isAbsolute() && + "Sym is not an absolute symbol"); + assert(AbsoluteSymbols.count(&Sym) && + "Symbol is not in the absolute symbols set"); + AbsoluteSymbols.erase(&Sym); + Addressable &Base = *Sym.Base; + destroySymbol(Sym); + destroyAddressable(Base); } - /// Remove the given defined atom from the graph. - void removeDefinedAtom(DefinedAtom &DA) { - if (AddrToAtomCache) { - assert(AddrToAtomCache->count(DA.getAddress()) && - "Cache exists, but does not contain atom"); - AddrToAtomCache->erase(DA.getAddress()); - } - if (DA.hasName()) { - assert(NamedAtoms.count(DA.getName()) && "Named atom not in map"); - NamedAtoms.erase(DA.getName()); - } - DA.getSection().removeAtom(DA); - DA.~DefinedAtom(); + /// Removes defined symbols. Does not remove the underlying block. + void removeDefinedSymbol(Symbol &Sym) { + assert(Sym.isDefined() && "Sym is not a defined symbol"); + Sym.getBlock().getSection().removeSymbol(Sym); + destroySymbol(Sym); } - /// Invalidate the atom-to-address map. - void invalidateAddrToAtomMap() { AddrToAtomCache = None; } + /// Remove a block. + void removeBlock(Block &B) { + Blocks.erase(&B); + destroyBlock(B); + } /// Dump the graph. /// @@ -778,87 +895,84 @@ class AtomGraph { std::function()); private: - AddressToAtomMap &getAddrToAtomMap() { - refreshAddrToAtomCache(); - return *AddrToAtomCache; - } - - const AddressToAtomMap &getAddrToAtomMap() const { - refreshAddrToAtomCache(); - return *AddrToAtomCache; - } - - void refreshAddrToAtomCache() const { - if (!AddrToAtomCache) { - AddrToAtomCache = AddressToAtomMap(); - for (auto *DA : defined_atoms()) - (*AddrToAtomCache)[DA->getAddress()] = const_cast(DA); - } - } - - // Put the BumpPtrAllocator first so that we don't free any of the atoms in - // it until all of their destructors have been run. - BumpPtrAllocator AtomAllocator; + // Put the BumpPtrAllocator first so that we don't free any of the underlying + // memory until the Symbol/Addressable destructors have been run. + BumpPtrAllocator Allocator; std::string Name; unsigned PointerSize; support::endianness Endianness; + BlockSet Blocks; SectionList Sections; - NamedAtomMap NamedAtoms; - ExternalAtomSet ExternalAtoms; - ExternalAtomSet AbsoluteAtoms; - mutable Optional AddrToAtomCache; + ExternalSymbolSet ExternalSymbols; + ExternalSymbolSet AbsoluteSymbols; }; -/// A function for mutating AtomGraphs. -using AtomGraphPassFunction = std::function; +/// A function for mutating LinkGraphs. +using LinkGraphPassFunction = std::function; -/// A list of atom graph passes. -using AtomGraphPassList = std::vector; +/// A list of LinkGraph passes. +using LinkGraphPassList = std::vector; -/// An atom graph pass configuration, consisting of a list of pre-prune, +/// An LinkGraph pass configuration, consisting of a list of pre-prune, /// post-prune, and post-fixup passes. struct PassConfiguration { /// Pre-prune passes. /// /// These passes are called on the graph after it is built, and before any - /// atoms have been pruned. + /// symbols have been pruned. /// - /// Notable use cases: Marking atoms live or should-discard. - AtomGraphPassList PrePrunePasses; + /// Notable use cases: Marking symbols live or should-discard. + LinkGraphPassList PrePrunePasses; /// Post-prune passes. /// - /// These passes are called on the graph after dead and should-discard atoms - /// have been removed, but before fixups are applied. 
+ /// These passes are called on the graph after dead stripping, but before + /// fixups are applied. /// - /// Notable use cases: Building GOT, stub, and TLV atoms. - AtomGraphPassList PostPrunePasses; + /// Notable use cases: Building GOT, stub, and TLV symbols. + LinkGraphPassList PostPrunePasses; /// Post-fixup passes. /// - /// These passes are called on the graph after atom contents has been copied + /// These passes are called on the graph after block contents has been copied /// to working memory, and fixups applied. /// /// Notable use cases: Testing and validation. - AtomGraphPassList PostFixupPasses; + LinkGraphPassList PostFixupPasses; }; /// A map of symbol names to resolved addresses. using AsyncLookupResult = DenseMap; -/// A function to call with a resolved symbol map (See AsyncLookupResult) or an -/// error if resolution failed. -using JITLinkAsyncLookupContinuation = - std::function LR)>; +/// A function object to call with a resolved symbol map (See AsyncLookupResult) +/// or an error if resolution failed. +class JITLinkAsyncLookupContinuation { +public: + virtual ~JITLinkAsyncLookupContinuation() {} + virtual void run(Expected LR) = 0; + +private: + virtual void anchor(); +}; + +/// Create a lookup continuation from a function object. +template +std::unique_ptr +createLookupContinuation(Continuation Cont) { -/// An asynchronous symbol lookup. Performs a search (possibly asynchronously) -/// for the given symbols, calling the given continuation with either the result -/// (if the lookup succeeds), or an error (if the lookup fails). -using JITLinkAsyncLookupFunction = - std::function &Symbols, - JITLinkAsyncLookupContinuation LookupContinuation)>; + class Impl final : public JITLinkAsyncLookupContinuation { + public: + Impl(Continuation C) : C(std::move(C)) {} + void run(Expected LR) override { C(std::move(LR)); } + + private: + Continuation C; + }; + + return std::make_unique(std::move(Cont)); +} /// Holds context for a single jitLink invocation. class JITLinkContext { @@ -881,13 +995,13 @@ class JITLinkContext { /// lookup continutation which it must call with a result to continue the /// linking process. virtual void lookup(const DenseSet &Symbols, - JITLinkAsyncLookupContinuation LookupContinuation) = 0; + std::unique_ptr LC) = 0; - /// Called by JITLink once all defined atoms in the graph have been assigned - /// their final memory locations in the target process. At this point he - /// atom graph can be, inspected to build a symbol table however the atom + /// Called by JITLink once all defined symbols in the graph have been assigned + /// their final memory locations in the target process. At this point the + /// LinkGraph can be inspected to build a symbol table, however the block /// content will not generally have been copied to the target location yet. - virtual void notifyResolved(AtomGraph &G) = 0; + virtual void notifyResolved(LinkGraph &G) = 0; /// Called by JITLink to notify the context that the object has been /// finalized (i.e. emitted to memory and memory permissions set). If all of @@ -904,20 +1018,20 @@ class JITLinkContext { /// Returns the mark-live pass to be used for this link. If no pass is /// returned (the default) then the target-specific linker implementation will - /// choose a conservative default (usually marking all atoms live). + /// choose a conservative default (usually marking all symbols live). 
/// This function is only called if shouldAddDefaultTargetPasses returns true, /// otherwise the JITContext is responsible for adding a mark-live pass in /// modifyPassConfig. - virtual AtomGraphPassFunction getMarkLivePass(const Triple &TT) const; + virtual LinkGraphPassFunction getMarkLivePass(const Triple &TT) const; /// Called by JITLink to modify the pass pipeline prior to linking. /// The default version performs no modification. virtual Error modifyPassConfig(const Triple &TT, PassConfiguration &Config); }; -/// Marks all atoms in a graph live. This can be used as a default, conservative -/// mark-live implementation. -Error markAllAtomsLive(AtomGraph &G); +/// Marks all symbols in a graph live. This can be used as a default, +/// conservative mark-live implementation. +Error markAllSymbolsLive(LinkGraph &G); /// Basic JITLink implementation. /// diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h index 9d0b37fe4a4d71..ac5a593bb77baa 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h @@ -33,20 +33,19 @@ class JITLinkMemoryManager { class SegmentRequest { public: SegmentRequest() = default; - SegmentRequest(size_t ContentSize, unsigned ContentAlign, - uint64_t ZeroFillSize, unsigned ZeroFillAlign) - : ContentSize(ContentSize), ZeroFillSize(ZeroFillSize), - ContentAlign(ContentAlign), ZeroFillAlign(ZeroFillAlign) {} + SegmentRequest(uint64_t Alignment, size_t ContentSize, + uint64_t ZeroFillSize) + : Alignment(Alignment), ContentSize(ContentSize), + ZeroFillSize(ZeroFillSize) { + assert(isPowerOf2_32(Alignment) && "Alignment must be power of 2"); + } + uint64_t getAlignment() const { return Alignment; } size_t getContentSize() const { return ContentSize; } - unsigned getContentAlignment() const { return ContentAlign; } uint64_t getZeroFillSize() const { return ZeroFillSize; } - unsigned getZeroFillAlignment() const { return ZeroFillAlign; } - private: + uint64_t Alignment = 0; size_t ContentSize = 0; uint64_t ZeroFillSize = 0; - unsigned ContentAlign = 0; - unsigned ZeroFillAlign = 0; }; using SegmentsRequestMap = DenseMap; diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index fa053fa70291ed..c6981506624954 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -1649,7 +1649,7 @@ class IRBuilder : public IRBuilderBase, public Inserter { StoreInst *CreateAlignedStore(Value *Val, Value *Ptr, unsigned Align, bool isVolatile = false) { StoreInst *SI = CreateStore(Val, Ptr, isVolatile); - SI->setAlignment(Align); + SI->setAlignment(MaybeAlign(Align)); return SI; } diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 71cf9fc38d8313..eaaf5064646208 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -374,8 +374,6 @@ class StoreInst : public Instruction { return 0; } - // FIXME: Remove once migration to Align is over. - void setAlignment(unsigned Align); void setAlignment(MaybeAlign Align); /// Returns the ordering constraint of this store instruction. 
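A minimal usage sketch of the alignment API after the MaybeAlign change above (not part of this patch; the Builder, Val, and Ptr names are assumed placeholders from surrounding code):

// Illustrative sketch only: how a caller sets store alignment once
// StoreInst::setAlignment(unsigned) is gone and MaybeAlign is used instead.
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

static StoreInst *emitAlignedStore(IRBuilder<> &Builder, Value *Val,
                                   Value *Ptr) {
  // CreateAlignedStore still takes an unsigned and wraps it in MaybeAlign
  // internally, per the IRBuilder change above.
  StoreInst *SI = Builder.CreateAlignedStore(Val, Ptr, /*Align=*/8);
  // Direct updates now go through MaybeAlign; a default-constructed
  // MaybeAlign would mean "alignment unknown".
  SI->setAlignment(MaybeAlign(16));
  return SI;
}
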
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index f415336119ffa3..9400f0a0801e86 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -273,12 +273,16 @@ namespace llvm { case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_maxnum: case Intrinsic::experimental_constrained_minnum: case Intrinsic::experimental_constrained_ceil: case Intrinsic::experimental_constrained_floor: + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: case Intrinsic::experimental_constrained_round: case Intrinsic::experimental_constrained_trunc: return true; diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index c33f66a2448f74..9e4ebd915afced 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -101,7 +101,7 @@ namespace Intrinsic { Argument, ExtendArgument, TruncArgument, HalfVecArgument, SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfAnyPtrsToElt, VecElementArgument, ScalableVecArgument, Subdivide2Argument, - Subdivide4Argument + Subdivide4Argument, VecOfBitcastsToInt } Kind; union { @@ -127,7 +127,8 @@ namespace Intrinsic { Kind == TruncArgument || Kind == HalfVecArgument || Kind == SameVecWidthArgument || Kind == PtrToArgument || Kind == PtrToElt || Kind == VecElementArgument || - Kind == Subdivide2Argument || Kind == Subdivide4Argument); + Kind == Subdivide2Argument || Kind == Subdivide4Argument || + Kind == VecOfBitcastsToInt); return Argument_Info >> 3; } ArgKind getArgumentKind() const { @@ -135,7 +136,7 @@ namespace Intrinsic { Kind == TruncArgument || Kind == HalfVecArgument || Kind == SameVecWidthArgument || Kind == PtrToArgument || Kind == VecElementArgument || Kind == Subdivide2Argument || - Kind == Subdivide4Argument); + Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt); return (ArgKind)(Argument_Info & 7); } diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 0ee06267a3b7f2..e764ad4e566eba 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -193,6 +193,10 @@ class LLVMHalfElementsVectorType : LLVMMatchType; class LLVMSubdivide2VectorType : LLVMMatchType; class LLVMSubdivide4VectorType : LLVMMatchType; +// Match the element count and bit width of another intrinsic parameter, but +// change the element type to an integer. 
+class LLVMVectorOfBitcastsToInt : LLVMMatchType; + def llvm_void_ty : LLVMType; let isAny = 1 in { def llvm_any_ty : LLVMType; @@ -699,6 +703,14 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in { [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_lrint : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_llrint : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_maxnum : Intrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, @@ -717,6 +729,12 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn] in { [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_lround : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_llround : Intrinsic<[ llvm_anyint_ty ], + [ llvm_anyfloat_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_round : Intrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, llvm_metadata_ty, diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 2a69a51603f0ec..e73f5b8b2b0062 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -768,6 +768,13 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_SVE_CNT_Intrinsic + : Intrinsic<[LLVMVectorOfBitcastsToInt<0>], + [LLVMVectorOfBitcastsToInt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty], + [IntrNoMem]>; + class AdvSIMD_SVE_PUNPKHI_Intrinsic : Intrinsic<[LLVMHalfElementsVectorType<0>], [llvm_anyvector_ty], @@ -792,6 +799,12 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
def int_aarch64_sve_abs : AdvSIMD_Merged1VectorArg_Intrinsic; def int_aarch64_sve_neg : AdvSIMD_Merged1VectorArg_Intrinsic; +// +// Counting bits +// + +def int_aarch64_sve_cnt : AdvSIMD_SVE_CNT_Intrinsic; + // // Floating-point comparisons // diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 4e63192d783eec..c1826b0add83f7 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -943,7 +943,10 @@ class AMDGPURawBufferLoad : Intrinsic < [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<3>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; @@ -955,7 +958,10 @@ class AMDGPUStructBufferLoad : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<4>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; @@ -967,7 +973,10 @@ class AMDGPURawBufferStore : Intrinsic < llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<4>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore; @@ -980,7 +989,10 @@ class AMDGPUStructBufferStore : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; @@ -1094,7 +1106,10 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<3>, ImmArg<4>], "", [SDNPMemOperand]>, 
AMDGPURsrcIntrinsic<0>; @@ -1105,7 +1120,10 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1116,7 +1134,10 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1128,7 +1149,10 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<5>, ImmArg<6>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h index 03d2ce4d7f88f2..58502907f0e352 100644 --- a/llvm/include/llvm/IR/Value.h +++ b/llvm/include/llvm/IR/Value.h @@ -523,6 +523,16 @@ class Value { static_cast(this)->stripPointerCasts()); } + /// Strip off pointer casts, all-zero GEPs, address space casts, and aliases. + /// + /// Returns the original uncasted value. If this is called on a non-pointer + /// value, it returns 'this'. + const Value *stripPointerCastsAndAliases() const; + Value *stripPointerCastsAndAliases() { + return const_cast( + static_cast(this)->stripPointerCastsAndAliases()); + } + /// Strip off pointer casts, all-zero GEPs and address space casts /// but ensures the representation of the result stays the same. /// diff --git a/llvm/include/llvm/MC/MCInstrDesc.h b/llvm/include/llvm/MC/MCInstrDesc.h index b119d90f6513f5..e75a27614a22d5 100644 --- a/llvm/include/llvm/MC/MCInstrDesc.h +++ b/llvm/include/llvm/MC/MCInstrDesc.h @@ -129,7 +129,8 @@ namespace MCID { /// not use these directly. These all correspond to bitfields in the /// MCInstrDesc::Flags field. enum Flag { - Variadic = 0, + PreISelOpcode = 0, + Variadic, HasOptionalDef, Pseudo, Return, @@ -242,6 +243,10 @@ class MCInstrDesc { /// Return flags of this instruction. uint64_t getFlags() const { return Flags; } + /// \returns true if this instruction is emitted before instruction selection + /// and should be legalized/regbankselected/selected. + bool isPreISelOpcode() const { return Flags & (1ULL << MCID::PreISelOpcode); } + /// Return true if this instruction can have a variable number of /// operands. 
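The operand comments above redefine the trailing i32 of the AMDGPU buffer intrinsics as generic auxiliary data, with the cache-policy bits joined by a new swizzle bit. A small sketch of packing that immediate from the documented bit positions; the helper name is illustrative and not part of the patch:

#include <cstdint>

// Bit layout from the updated operand comments:
//   bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10+), bit 3 = swz.
constexpr uint32_t packBufferAuxData(bool GLC, bool SLC, bool DLC, bool SWZ) {
  return (GLC ? 1u : 0u) | (SLC ? 2u : 0u) | (DLC ? 4u : 0u) | (SWZ ? 8u : 0u);
}

static_assert(packBufferAuxData(true, false, false, true) == 0x9,
              "glc together with swz");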
In this case, the variable operands will be after the normal /// operands but before the implicit definitions and uses (if any are diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index 61bd4de092ac68..ef2b4fba031d12 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -137,7 +137,8 @@ struct Section { StackSizes, SymtabShndxSection, Symver, - MipsABIFlags + MipsABIFlags, + Addrsig }; SectionKind Kind; StringRef Name; @@ -223,6 +224,7 @@ struct NoBitsSection : Section { struct HashSection : Section { Optional Content; + Optional Size; Optional> Bucket; Optional> Chain; @@ -255,6 +257,26 @@ struct VerneedSection : Section { } }; +struct AddrsigSymbol { + AddrsigSymbol(StringRef N) : Name(N), Index(None) {} + AddrsigSymbol(llvm::yaml::Hex32 Ndx) : Name(None), Index(Ndx) {} + AddrsigSymbol() : Name(None), Index(None) {} + + Optional Name; + Optional Index; +}; + +struct AddrsigSection : Section { + Optional Content; + Optional Size; + Optional> Symbols; + + AddrsigSection() : Section(SectionKind::Addrsig) {} + static bool classof(const Section *S) { + return S->Kind == SectionKind::Addrsig; + } +}; + struct SymverSection : Section { std::vector Entries; @@ -361,6 +383,7 @@ struct Object { } // end namespace ELFYAML } // end namespace llvm +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::AddrsigSymbol) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::StackSizeEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::DynamicEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::ProgramHeader) @@ -517,6 +540,10 @@ template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::VernauxEntry &E); }; +template <> struct MappingTraits { + static void mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym); +}; + template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::Relocation &Rel); }; diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 9ad740ed804eb5..936ebcecfe96d4 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -145,11 +145,25 @@ static inline std::string getSecName(SecType Type) { // and SampleProfileExtBinaryBaseWriter. struct SecHdrTableEntry { SecType Type; - uint64_t Flag; + uint64_t Flags; uint64_t Offset; uint64_t Size; }; +enum SecFlags { SecFlagInValid = 0, SecFlagCompress = (1 << 0) }; + +static inline void addSecFlags(SecHdrTableEntry &Entry, uint64_t Flags) { + Entry.Flags |= Flags; +} + +static inline void removeSecFlags(SecHdrTableEntry &Entry, uint64_t Flags) { + Entry.Flags &= ~Flags; +} + +static inline bool hasSecFlag(SecHdrTableEntry &Entry, SecFlags Flag) { + return Entry.Flags & Flag; +} + /// Represents the relative location of an instruction. 
/// /// Instruction locations are specified by the line offset from the @@ -643,9 +657,9 @@ class ProfileSymbolList { unsigned size() { return Syms.size(); } void setToCompress(bool TC) { ToCompress = TC; } + bool toCompress() { return ToCompress; } - std::error_code read(uint64_t CompressSize, uint64_t UncompressSize, - const uint8_t *Data); + std::error_code read(const uint8_t *Data, uint64_t ListSize); std::error_code write(raw_ostream &OS); void dump(raw_ostream &OS = dbgs()) const; diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index 761dbde059569f..424818bbb26df8 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -488,6 +488,14 @@ class SampleProfileReaderRawBinary : public SampleProfileReaderBinary { /// possible to define other types of profile inherited from /// SampleProfileReaderExtBinaryBase/SampleProfileWriterExtBinaryBase. class SampleProfileReaderExtBinaryBase : public SampleProfileReaderBinary { +private: + std::error_code decompressSection(const uint8_t *SecStart, + const uint64_t SecSize, + const uint8_t *&DecompressBuf, + uint64_t &DecompressBufSize); + + BumpPtrAllocator Allocator; + protected: std::vector SecHdrTable; std::unique_ptr ProfSymList; @@ -518,7 +526,7 @@ class SampleProfileReaderExtBinary : public SampleProfileReaderExtBinaryBase { virtual std::error_code verifySPMagic(uint64_t Magic) override; virtual std::error_code readOneSection(const uint8_t *Start, uint64_t Size, SecType Type) override; - std::error_code readProfileSymbolList(); + std::error_code readProfileSymbolList(uint64_t Size); public: SampleProfileReaderExtBinary(std::unique_ptr B, LLVMContext &C, diff --git a/llvm/include/llvm/ProfileData/SampleProfWriter.h b/llvm/include/llvm/ProfileData/SampleProfWriter.h index 35218e3879c4a3..ce60baf66c65c9 100644 --- a/llvm/include/llvm/ProfileData/SampleProfWriter.h +++ b/llvm/include/llvm/ProfileData/SampleProfWriter.h @@ -143,14 +143,16 @@ class SampleProfileWriterRawBinary : public SampleProfileWriterBinary { class SampleProfileWriterExtBinaryBase : public SampleProfileWriterBinary { using SampleProfileWriterBinary::SampleProfileWriterBinary; - public: virtual std::error_code write(const StringMap &ProfileMap) override; + void setToCompressAllSections(); + void setToCompressSection(SecType Type); + protected: - uint64_t markSectionStart(); - uint64_t addNewSection(SecType Sec, uint64_t SectionStart); + uint64_t markSectionStart(SecType Type); + std::error_code addNewSection(SecType Sec, uint64_t SectionStart); virtual void initSectionLayout() = 0; virtual std::error_code writeSections(const StringMap &ProfileMap) = 0; @@ -158,34 +160,52 @@ class SampleProfileWriterExtBinaryBase : public SampleProfileWriterBinary { // Specifiy the section layout in the profile. Note that the order in // SecHdrTable (order to collect sections) may be different from the // order in SectionLayout (order to write out sections into profile). - SmallVector SectionLayout; + SmallVector SectionLayout; private: void allocSecHdrTable(); std::error_code writeSecHdrTable(); virtual std::error_code writeHeader(const StringMap &ProfileMap) override; - + void addSectionFlags(SecType Type, SecFlags Flags); + SecHdrTableEntry &getEntryInLayout(SecType Type); + std::error_code compressAndOutput(); + + // We will swap the raw_ostream held by LocalBufStream and that + // held by OutputStream if we try to add a section which needs + // compression. 
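The new SecFlags helpers declared in SampleProf.h above are plain bit-mask operations on a section header entry. A minimal usage sketch, assuming only the declarations added by this patch (the wrapper function is illustrative):

#include "llvm/ProfileData/SampleProf.h"
#include <cassert>

using namespace llvm::sampleprof;

// Mark a section header entry as compressed, then clear the flag again.
void toggleCompression(SecHdrTableEntry &Entry) {
  addSecFlags(Entry, SecFlagCompress);
  assert(hasSecFlag(Entry, SecFlagCompress) && "flag should now be set");
  removeSecFlags(Entry, SecFlagCompress);
  assert(!hasSecFlag(Entry, SecFlagCompress) && "flag should be cleared");
}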
After the swap, all the data written to output + // will be temporarily buffered into the underlying raw_string_ostream + // originally held by LocalBufStream. After the data writing for the + // section is completed, compress the data in the local buffer, + // swap the raw_ostream back and write the compressed data to the + // real output. + std::unique_ptr LocalBufStream; // The location where the output stream starts. uint64_t FileStart; // The location in the output stream where the SecHdrTable should be // written to. uint64_t SecHdrTableOffset; + // Initial Section Flags setting. std::vector SecHdrTable; }; class SampleProfileWriterExtBinary : public SampleProfileWriterExtBinaryBase { - using SampleProfileWriterExtBinaryBase::SampleProfileWriterExtBinaryBase; - public: + SampleProfileWriterExtBinary(std::unique_ptr &OS) + : SampleProfileWriterExtBinaryBase(OS) { + initSectionLayout(); + } + virtual void setProfileSymbolList(ProfileSymbolList *PSL) override { ProfSymList = PSL; }; private: virtual void initSectionLayout() override { - SectionLayout = {SecProfSummary, SecNameTable, SecLBRProfile, - SecProfileSymbolList}; + SectionLayout = {{SecProfSummary, 0, 0, 0}, + {SecNameTable, 0, 0, 0}, + {SecLBRProfile, 0, 0, 0}, + {SecProfileSymbolList, 0, 0, 0}}; }; virtual std::error_code writeSections(const StringMap &ProfileMap) override; diff --git a/llvm/include/llvm/Support/Automaton.h b/llvm/include/llvm/Support/Automaton.h new file mode 100644 index 00000000000000..5fe0824017dd97 --- /dev/null +++ b/llvm/include/llvm/Support/Automaton.h @@ -0,0 +1,229 @@ +//===-- Automaton.h - Support for driving TableGen-produced DFAs ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements class that drive and introspect deterministic finite- +// state automata (DFAs) as generated by TableGen's -gen-automata backend. +// +// For a description of how to define an automaton, see +// include/llvm/TableGen/Automaton.td. +// +// One important detail is that these deterministic automata are created from +// (potentially) nondeterministic definitions. Therefore a unique sequence of +// input symbols will produce one path through the DFA but multiple paths +// through the original NFA. An automaton by default only returns "accepted" or +// "not accepted", but frequently we want to analyze what NFA path was taken. +// Finding a path through the NFA states that results in a DFA state can help +// answer *what* the solution to a problem was, not just that there exists a +// solution. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_AUTOMATON_H +#define LLVM_SUPPORT_AUTOMATON_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Allocator.h" +#include +#include +#include +#include +#include + +namespace llvm { + +using NfaPath = SmallVector; + +/// Forward define the pair type used by the automata transition info tables. +/// +/// Experimental results with large tables have shown a significant (multiple +/// orders of magnitude) parsing speedup by using a custom struct here with a +/// trivial constructor rather than std::pair. 
+struct NfaStatePair { + uint64_t FromDfaState, ToDfaState; + + bool operator<(const NfaStatePair &Other) const { + return std::make_tuple(FromDfaState, ToDfaState) < + std::make_tuple(Other.FromDfaState, Other.ToDfaState); + } +}; + +namespace internal { +/// The internal class that maintains all possible paths through an NFA based +/// on a path through the DFA. +class NfaTranscriber { +private: + /// Cached transition table. This is a table of NfaStatePairs that contains + /// zero-terminated sequences pointed to by DFA transitions. + ArrayRef TransitionInfo; + + /// A simple linked-list of traversed states that can have a shared tail. The + /// traversed path is stored in reverse order with the latest state as the + /// head. + struct PathSegment { + uint64_t State; + PathSegment *Tail; + }; + + /// We allocate segment objects frequently. Allocate them upfront and dispose + /// at the end of a traversal rather than hammering the system allocator. + SpecificBumpPtrAllocator Allocator; + + /// Heads of each tracked path. These are not ordered. + std::deque Heads; + + /// The returned paths. This is populated during getPaths. + SmallVector Paths; + + /// Create a new segment and return it. + PathSegment *makePathSegment(uint64_t State, PathSegment *Tail) { + PathSegment *P = Allocator.Allocate(); + *P = {State, Tail}; + return P; + } + + /// Pairs defines a sequence of possible NFA transitions for a single DFA + /// transition. + void transition(ArrayRef Pairs) { + // Iterate over all existing heads. We will mutate the Heads deque during + // iteration. + unsigned NumHeads = Heads.size(); + for (unsigned I = 0; I < NumHeads; ++I) { + PathSegment *Head = Heads[I]; + // The sequence of pairs is sorted. Select the set of pairs that + // transition from the current head state. + auto PI = lower_bound(Pairs, NfaStatePair{Head->State, 0ULL}); + auto PE = upper_bound(Pairs, NfaStatePair{Head->State, INT64_MAX}); + // For every transition from the current head state, add a new path + // segment. + for (; PI != PE; ++PI) + if (PI->FromDfaState == Head->State) + Heads.push_back(makePathSegment(PI->ToDfaState, Head)); + } + // Now we've iterated over all the initial heads and added new ones, + // dispose of the original heads. + Heads.erase(Heads.begin(), std::next(Heads.begin(), NumHeads)); + } + +public: + NfaTranscriber(ArrayRef TransitionInfo) + : TransitionInfo(TransitionInfo) { + reset(); + } + + void reset() { + Paths.clear(); + Heads.clear(); + Allocator.DestroyAll(); + // The initial NFA state is 0. + Heads.push_back(makePathSegment(0ULL, nullptr)); + } + + void transition(unsigned TransitionInfoIdx) { + unsigned EndIdx = TransitionInfoIdx; + while (TransitionInfo[EndIdx].ToDfaState != 0) + ++EndIdx; + ArrayRef Pairs(&TransitionInfo[TransitionInfoIdx], + EndIdx - TransitionInfoIdx); + transition(Pairs); + } + + ArrayRef getPaths() { + Paths.clear(); + for (auto *Head : Heads) { + NfaPath P; + while (Head->State != 0) { + P.push_back(Head->State); + Head = Head->Tail; + } + std::reverse(P.begin(), P.end()); + Paths.push_back(std::move(P)); + } + return Paths; + } +}; +} // namespace internal + +/// A deterministic finite-state automaton. The automaton is defined in +/// TableGen; this object drives an automaton defined by tblgen-emitted tables. +/// +/// An automaton accepts a sequence of input tokens ("actions"). This class is +/// templated on the type of these actions. +template class Automaton { + /// Map from {State, Action} to {NewState, TransitionInfoIdx}. 
+ /// TransitionInfoIdx is used by the DfaTranscriber to analyze the transition. + /// FIXME: This uses a std::map because ActionT can be a pair type including + /// an enum. In particular DenseMapInfo must be defined to use + /// DenseMap here. + std::map, std::pair> M; + /// An optional transcription object. This uses much more state than simply + /// traversing the DFA for acceptance, so is heap allocated. + std::unique_ptr Transcriber; + /// The initial DFA state is 1. + uint64_t State = 1; + +public: + /// Create an automaton. + /// \param Transitions The Transitions table as created by TableGen. Note that + /// because the action type differs per automaton, the + /// table type is templated as ArrayRef. + /// \param TranscriptionTable The TransitionInfo table as created by TableGen. + /// + /// Providing the TranscriptionTable argument as non-empty will enable the + /// use of transcription, which analyzes the possible paths in the original + /// NFA taken by the DFA. NOTE: This is substantially more work than simply + /// driving the DFA, so unless you require the getPaths() method leave this + /// empty. + template + Automaton(ArrayRef Transitions, + ArrayRef TranscriptionTable = {}) { + if (!TranscriptionTable.empty()) + Transcriber = + std::make_unique(TranscriptionTable); + for (const auto &I : Transitions) + // Greedily read and cache the transition table. + M.emplace(std::make_pair(I.FromDfaState, I.Action), + std::make_pair(I.ToDfaState, I.InfoIdx)); + } + + /// Reset the automaton to its initial state. + void reset() { + State = 1; + if (Transcriber) + Transcriber->reset(); + } + + /// Transition the automaton based on input symbol A. Return true if the + /// automaton transitioned to a valid state, false if the automaton + /// transitioned to an invalid state. + /// + /// If this function returns false, all methods are undefined until reset() is + /// called. + bool add(const ActionT &A) { + auto I = M.find({State, A}); + if (I == M.end()) + return false; + if (Transcriber) + Transcriber->transition(I->second.second); + State = I->second.first; + return true; + } + + /// Obtain a set of possible paths through the input nondeterministic + /// automaton that could be obtained from the sequence of input actions + /// presented to this deterministic automaton. + ArrayRef getNfaPaths() { + assert(Transcriber && "Can only obtain NFA paths if transcribing!"); + return Transcriber->getPaths(); + } +}; + +} // namespace llvm + +#endif // LLVM_SUPPORT_AUTOMATON_H diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index 3cc2c3c0121b2b..63784463e17185 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -2000,6 +2000,9 @@ void ResetAllOptionOccurrences(); /// where no options are supported. void ResetCommandLineParser(); +/// Parses `Arg` into the option handler `Handler`. +bool ProvidePositionalOption(Option *Handler, StringRef Arg, int i); + } // end namespace cl } // end namespace llvm diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index 7707306a9be6c7..a29a9d787947f7 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -1215,9 +1215,9 @@ class directory_entry { // that whole structure, callers end up paying for a stat(). // std::filesystem::directory_entry may be a better model. std::string Path; - file_type Type; // Most platforms can provide this. 
- bool FollowSymlinks; // Affects the behavior of status(). - basic_file_status Status; // If available. + file_type Type = file_type::type_unknown; // Most platforms can provide this. + bool FollowSymlinks = true; // Affects the behavior of status(). + basic_file_status Status; // If available. public: explicit directory_entry(const Twine &Path, bool FollowSymlinks = true, diff --git a/llvm/include/llvm/TableGen/Automaton.td b/llvm/include/llvm/TableGen/Automaton.td new file mode 100644 index 00000000000000..13ced2a0e78404 --- /dev/null +++ b/llvm/include/llvm/TableGen/Automaton.td @@ -0,0 +1,95 @@ +//===- Automaton.td ----------------------------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the key top-level classes needed to produce a reasonably +// generic finite-state automaton. +// +//===----------------------------------------------------------------------===// + +// Define a record inheriting from GenericAutomaton to generate a reasonably +// generic finite-state automaton over a set of actions and states. +// +// This automaton is defined by: +// 1) a state space (explicit, always bits<32>). +// 2) a set of input symbols (actions, explicit) and +// 3) a transition function from state + action -> state. +// +// A theoretical automaton is defined by : +// Q: A set of possible states. +// S: (sigma) The input alphabet. +// d: (delta) The transition function f(q in Q, s in S) -> q' in Q. +// F: The set of final (accepting) states. +// +// Because generating all possible states is tedious, we instead define the +// transition function only and crawl all reachable states starting from the +// initial state with all inputs under all transitions until termination. +// +// We define F = S, that is, all valid states are accepting. +// +// To ensure the generation of the automaton terminates, the state transitions +// are defined as a lattice (meaning every transitioned-to state is more +// specific than the transitioned-from state, for some definition of specificity). +// Concretely a transition may set one or more bits in the state that were +// previously zero to one. If any bit was not zero, the transition is invalid. +// +// Instead of defining all possible states (which would be cumbersome), the user +// provides a set of possible Transitions from state A, consuming an input +// symbol A to state B. The Transition object transforms state A to state B and +// acts as a predicate. This means the state space can be discovered by crawling +// all the possible transitions until none are valid. +// +// This automaton is considered to be nondeterministic, meaning that multiple +// transitions can occur from any (state, action) pair. The generated automaton +// is determinized, meaning that is executes in O(k) time where k is the input +// sequence length. +// +// In addition to a generated automaton that determines if a sequence of inputs +// is accepted or not, a table is emitted that allows determining a plausible +// sequence of states traversed to accept that input. +class GenericAutomaton { + // Name of a class that inherits from Transition. All records inheriting from + // this class will be considered when constructing the automaton. 
+ string TransitionClass; + + // Names of fields within TransitionClass that define the action symbol. This + // defines the action as an N-tuple. + // + // Each symbol field can be of class, int, string or code type. + // If the type of a field is a class, the Record's name is used verbatim + // in C++ and the class name is used as the C++ type name. + // If the type of a field is a string, code or int, that is also used + // verbatim in C++. + // + // To override the C++ type name for field F, define a field called TypeOf_F. + // This should be a string that will be used verbatim in C++. + // + // As an example, to define a 2-tuple with an enum and a string, one might: + // def MyTransition : Transition { + // MyEnum S1; + // int S2; + // } + // def MyAutomaton : GenericAutomaton }{ + // let TransitionClass = "Transition"; + // let SymbolFields = ["S1", "S2"]; + // let TypeOf_S1 = "MyEnumInCxxKind"; + // } + list SymbolFields; +} + +// All transitions inherit from Transition. +class Transition { + // A transition S' = T(S) is valid if, for every set bit in NewState, the + // corresponding bit in S is clear. That is: + // def T(S): + // S' = S | NewState + // return S' if S' != S else Failure + // + // The automaton generator uses this property to crawl the set of possible + // transitions from a starting state of 0b0. + bits<32> NewState; +} diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index ad32b9fab75d38..4b49dfd4dd18e8 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -15,7 +15,9 @@ // Unary ops. //------------------------------------------------------------------------------ -class GenericInstruction : StandardPseudoInstruction; +class GenericInstruction : StandardPseudoInstruction { + let isPreISelOpcode = 1; +} // Extend the underlying scalar type of an operation, leaving the high bits // unspecified. diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td new file mode 100644 index 00000000000000..065e28eca8a699 --- /dev/null +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -0,0 +1,17 @@ +//===- Combine.td - Combine rule definitions ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declare GlobalISel combine rules and provide mechanisms to opt-out. +// +//===----------------------------------------------------------------------===// + +// Declares a combiner helper class +class GICombinerHelper { + // The class name to use in the generated output. + string Classname = classname; +} diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 93a6135928828d..dd8679661b9aca 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -492,6 +492,10 @@ class Instruction : InstructionEncoding { // Added complexity passed onto matching pattern. int AddedComplexity = 0; + // Indicates if this is a pre-isel opcode that should be + // legalized/regbankselected/selected. + bit isPreISelOpcode = 0; + // These bits capture information about the high-level semantics of the // instruction. bit isReturn = 0; // Is this instruction a return instruction? 
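A usage sketch for the llvm::Automaton driver introduced in Support/Automaton.h above. In practice the transition table is emitted by the -gen-automata TableGen backend from definitions like those in Automaton.td; the row struct and table below are invented for illustration and only mirror the four fields the constructor reads (FromDfaState, Action, ToDfaState, InfoIdx):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Automaton.h"
#include <cstdint>

// Hypothetical row type; tblgen emits an equivalent struct per automaton.
struct ExampleTransition {
  uint64_t FromDfaState; // DFA state the edge leaves
  uint64_t Action;       // input symbol (ActionT is uint64_t here)
  uint64_t ToDfaState;   // DFA state the edge enters
  unsigned InfoIdx;      // index into the (unused) transcription table
};

// A two-state DFA over the alphabet {1, 2}: 1 --1--> 2 --2--> 1.
static const ExampleTransition Table[] = {
    {1, 1, 2, 0},
    {2, 2, 1, 0},
};

int main() {
  llvm::Automaton<uint64_t> A(llvm::makeArrayRef(Table));
  bool Accepted = A.add(1) && A.add(2); // 1 -> 2 -> 1, both edges exist
  bool Rejected = !A.add(2);            // no edge for (state 1, action 2)
  A.reset();                            // back to the initial DFA state
  return (Accepted && Rejected) ? 0 : 1;
}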
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 8856cf003af006..441f3d7d118d1e 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -506,12 +506,20 @@ def strict_flog2 : SDNode<"ISD::STRICT_FLOG2", SDTFPUnaryOp, [SDNPHasChain]>; def strict_frint : SDNode<"ISD::STRICT_FRINT", SDTFPUnaryOp, [SDNPHasChain]>; +def strict_lrint : SDNode<"ISD::STRICT_LRINT", + SDTFPToIntOp, [SDNPHasChain]>; +def strict_llrint : SDNode<"ISD::STRICT_LLRINT", + SDTFPToIntOp, [SDNPHasChain]>; def strict_fnearbyint : SDNode<"ISD::STRICT_FNEARBYINT", SDTFPUnaryOp, [SDNPHasChain]>; def strict_fceil : SDNode<"ISD::STRICT_FCEIL", SDTFPUnaryOp, [SDNPHasChain]>; def strict_ffloor : SDNode<"ISD::STRICT_FFLOOR", SDTFPUnaryOp, [SDNPHasChain]>; +def strict_lround : SDNode<"ISD::STRICT_LROUND", + SDTFPToIntOp, [SDNPHasChain]>; +def strict_llround : SDNode<"ISD::STRICT_LLROUND", + SDTFPToIntOp, [SDNPHasChain]>; def strict_fround : SDNode<"ISD::STRICT_FROUND", SDTFPUnaryOp, [SDNPHasChain]>; def strict_ftrunc : SDNode<"ISD::STRICT_FTRUNC", @@ -1339,6 +1347,12 @@ def any_flog2 : PatFrags<(ops node:$src), def any_frint : PatFrags<(ops node:$src), [(strict_frint node:$src), (frint node:$src)]>; +def any_lrint : PatFrags<(ops node:$src), + [(strict_lrint node:$src), + (lrint node:$src)]>; +def any_llrint : PatFrags<(ops node:$src), + [(strict_llrint node:$src), + (llrint node:$src)]>; def any_fnearbyint : PatFrags<(ops node:$src), [(strict_fnearbyint node:$src), (fnearbyint node:$src)]>; @@ -1348,6 +1362,12 @@ def any_fceil : PatFrags<(ops node:$src), def any_ffloor : PatFrags<(ops node:$src), [(strict_ffloor node:$src), (ffloor node:$src)]>; +def any_lround : PatFrags<(ops node:$src), + [(strict_lround node:$src), + (lround node:$src)]>; +def any_llround : PatFrags<(ops node:$src), + [(strict_llround node:$src), + (llround node:$src)]>; def any_fround : PatFrags<(ops node:$src), [(strict_fround node:$src), (fround node:$src)]>; diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h index b30d098acabe08..22435e4ed1e5b2 100644 --- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h +++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h @@ -251,7 +251,7 @@ void runWholeProgramDevirtOnIndex( /// devirt target names for any locals that were exported. void updateIndexWPDForExports( ModuleSummaryIndex &Summary, - StringMap &ExportLists, + function_ref isExported, std::map> &LocalWPDTargetsMap); } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h index 9d79ee1633f689..74584bce910a72 100644 --- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h @@ -106,11 +106,19 @@ class Value; /// returns false. Function *extractCodeRegion(); + /// Verify that assumption cache isn't stale after a region is extracted. + /// Returns false when verifier finds errors. AssumptionCache is passed as + /// parameter to make this function stateless. + static bool verifyAssumptionCache(const Function& F, AssumptionCache *AC); + /// Test whether this code extractor is eligible. /// /// Based on the blocks used when constructing the code extractor, /// determine whether it is eligible for extraction. 
- bool isEligible() const { return !Blocks.empty(); } + /// + /// Checks that varargs handling (with vastart and vaend) is only done in + /// the outlined blocks. + bool isEligible() const; /// Compute the set of input values and output values for the code. /// diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index 5c5e0aef34cdf4..9fcb2f64d79b83 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -412,8 +412,7 @@ void removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU = nullptr); /// Remove all blocks that can not be reached from the function's entry. /// /// Returns true if any basic block was removed. -bool removeUnreachableBlocks(Function &F, LazyValueInfo *LVI = nullptr, - DomTreeUpdater *DTU = nullptr, +bool removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU = nullptr, MemorySSAUpdater *MSSAU = nullptr); /// Combine the metadata of two instructions so that K can replace J. Some diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h index b722c47c1cab6b..88c2ef787ad817 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -181,6 +181,7 @@ class LibCallSimplifier { Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B); Value *optimizeRealloc(CallInst *CI, IRBuilder<> &B); Value *optimizeWcslen(CallInst *CI, IRBuilder<> &B); + Value *optimizeBCopy(CallInst *CI, IRBuilder<> &B); // Wrapper for all String/Memory Library Call Optimizations Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B); diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp index 7d6429a0fec1c0..129944743c5efe 100644 --- a/llvm/lib/Analysis/AssumptionCache.cpp +++ b/llvm/lib/Analysis/AssumptionCache.cpp @@ -130,7 +130,10 @@ void AssumptionCache::unregisterAssumption(CallInst *CI) { if (AVI != AffectedValues.end()) AffectedValues.erase(AVI); } - remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; }); + + AssumeHandles.erase( + remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; }), + AssumeHandles.end()); } void AssumptionCache::AffectedValueCallbackVH::deleted() { diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index a2c622e79056df..cb8987721700bc 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4592,6 +4592,9 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, static Value *SimplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF, const SimplifyQuery &Q, unsigned MaxRecurse) { + if (Constant *C = simplifyFPOp({Op0, Op1})) + return C; + // fmul X, 1.0 ==> X if (match(Op1, m_FPOne())) return Op0; @@ -4626,9 +4629,6 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q)) return C; - if (Constant *C = simplifyFPOp({Op0, Op1})) - return C; - // Now apply simplifications that do not require rounding. 
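The AssumptionCache change above is the standard erase-remove fix: remove_if only moves the elements to keep toward the front and returns the new logical end, so without the surrounding erase() the container keeps its original size and the stale handle stays reachable. A standalone illustration of the idiom:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Values = {1, 2, 3, 2};

  // Bug pattern: remove_if alone does not shrink the container.
  auto NewEnd = std::remove_if(Values.begin(), Values.end(),
                               [](int X) { return X == 2; });
  assert(Values.size() == 4 && "size unchanged after remove_if");

  // Fix, as applied to AssumeHandles above: erase the left-over tail.
  Values.erase(NewEnd, Values.end());
  assert((Values == std::vector<int>{1, 3}));
  return 0;
}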
return SimplifyFMAFMul(Op0, Op1, FMF, Q, MaxRecurse); } @@ -5186,6 +5186,15 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { } return nullptr; } + case Intrinsic::fma: + case Intrinsic::fmuladd: { + Value *Op0 = Call->getArgOperand(0); + Value *Op1 = Call->getArgOperand(1); + Value *Op2 = Call->getArgOperand(2); + if (Value *V = simplifyFPOp({ Op0, Op1, Op2 })) + return V; + return nullptr; + } default: return nullptr; } diff --git a/llvm/lib/Analysis/LazyCallGraph.cpp b/llvm/lib/Analysis/LazyCallGraph.cpp index cba8db49f0204d..ef31c1e0ba8ce8 100644 --- a/llvm/lib/Analysis/LazyCallGraph.cpp +++ b/llvm/lib/Analysis/LazyCallGraph.cpp @@ -632,7 +632,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall( // If the merge range is empty, then adding the edge didn't actually form any // new cycles. We're done. - if (empty(MergeRange)) { + if (MergeRange.empty()) { // Now that the SCC structure is finalized, flip the kind to call. SourceN->setEdgeKind(TargetN, Edge::Call); return false; // No new cycle. diff --git a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp index 7a12f597969ded..7de9d2cbfddba6 100644 --- a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp +++ b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp @@ -257,9 +257,8 @@ void DivergencePropagator::computeInfluenceRegion( void DivergencePropagator::exploreDataDependency(Value *V) { // Follow def-use chains of V. for (User *U : V->users()) { - Instruction *UserInst = cast(U); - if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second) - Worklist.push_back(UserInst); + if (!TTI.isAlwaysUniform(U) && DV.insert(U).second) + Worklist.push_back(U); } } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7f24a2c297a8e2..3d8f77675f3a2b 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1212,8 +1212,8 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, const SCEV *OffsetSCEVA = SE.getConstant(OffsetA); const SCEV *OffsetSCEVB = SE.getConstant(OffsetB); const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA); - const SCEVConstant *OffsetDeltaC = dyn_cast(OffsetDeltaSCEV); - const APInt &OffsetDelta = OffsetDeltaC->getAPInt(); + const APInt &OffsetDelta = cast(OffsetDeltaSCEV)->getAPInt(); + // Check if they are based on the same pointer. That makes the offsets // sufficient. if (PtrA == PtrB) diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 1dc63a3c3338fc..dbab5db7dbc2db 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -360,11 +360,17 @@ bool Loop::isAuxiliaryInductionVariable(PHINode &AuxIndVar, } BranchInst *Loop::getLoopGuardBranch() const { - assert(isLoopSimplifyForm() && "Only valid for loop in simplify form"); + if (!isLoopSimplifyForm()) + return nullptr; + BasicBlock *Preheader = getLoopPreheader(); - assert(Preheader && getLoopLatch() && + BasicBlock *Latch = getLoopLatch(); + assert(Preheader && Latch && "Expecting a loop with valid preheader and latch"); - assert(isLoopExiting(getLoopLatch()) && "Only valid for rotated loop"); + + // Loop should be in rotate form. + if (!isLoopExiting(Latch)) + return nullptr; // Disallow loops with more than one unique exit block, as we do not verify // that GuardOtherSucc post dominates all exit blocks. 
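Since getLoopGuardBranch above now returns nullptr for loops that are not in simplified and rotated form rather than asserting, callers need a null check. A hedged caller-side sketch (the helper is illustrative, not part of the patch):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Return the guard's condition if the loop has a conditional guard branch,
// or nullptr when getLoopGuardBranch() cannot identify one.
static Value *getGuardCondition(const Loop &L) {
  if (BranchInst *Guard = L.getLoopGuardBranch())
    if (Guard->isConditional())
      return Guard->getCondition();
  return nullptr;
}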
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index cc3bca1c231821..cfb8b7e7dcb55e 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -873,6 +873,7 @@ template class ClobberWalker { if (!DefChainEnd) for (auto *MA : def_chain(const_cast(Target))) DefChainEnd = MA; + assert(DefChainEnd && "Failed to find dominating phi/liveOnEntry"); // If any of the terminated paths don't dominate the phi we'll try to // optimize, we need to figure out what they are and quit. diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp index e0b27f1d501b58..d103c3a8b831fe 100644 --- a/llvm/lib/Analysis/MemorySSAUpdater.cpp +++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp @@ -347,51 +347,54 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { // If this is the first def in the block and this insert is in an arbitrary // place, compute IDF and place phis. + SmallPtrSet DefiningBlocks; + + // If this is the last Def in the block, also compute IDF based on MD, since + // this may a new Def added, and we may need additional Phis. auto Iter = MD->getDefsIterator(); ++Iter; auto IterEnd = MSSA->getBlockDefs(MD->getBlock())->end(); - if (Iter == IterEnd) { - SmallPtrSet DefiningBlocks; + if (Iter == IterEnd) DefiningBlocks.insert(MD->getBlock()); - for (const auto &VH : InsertedPHIs) - if (const auto *RealPHI = cast_or_null(VH)) - DefiningBlocks.insert(RealPHI->getBlock()); - ForwardIDFCalculator IDFs(*MSSA->DT); - SmallVector IDFBlocks; - IDFs.setDefiningBlocks(DefiningBlocks); - IDFs.calculate(IDFBlocks); - SmallVector, 4> NewInsertedPHIs; - for (auto *BBIDF : IDFBlocks) { - auto *MPhi = MSSA->getMemoryAccess(BBIDF); - if (!MPhi) { - MPhi = MSSA->createMemoryPhi(BBIDF); - NewInsertedPHIs.push_back(MPhi); - } - // Add the phis created into the IDF blocks to NonOptPhis, so they are - // not optimized out as trivial by the call to getPreviousDefFromEnd - // below. Once they are complete, all these Phis are added to the - // FixupList, and removed from NonOptPhis inside fixupDefs(). Existing - // Phis in IDF may need fixing as well, and potentially be trivial - // before this insertion, hence add all IDF Phis. See PR43044. - NonOptPhis.insert(MPhi); + + for (const auto &VH : InsertedPHIs) + if (const auto *RealPHI = cast_or_null(VH)) + DefiningBlocks.insert(RealPHI->getBlock()); + ForwardIDFCalculator IDFs(*MSSA->DT); + SmallVector IDFBlocks; + IDFs.setDefiningBlocks(DefiningBlocks); + IDFs.calculate(IDFBlocks); + SmallVector, 4> NewInsertedPHIs; + for (auto *BBIDF : IDFBlocks) { + auto *MPhi = MSSA->getMemoryAccess(BBIDF); + if (!MPhi) { + MPhi = MSSA->createMemoryPhi(BBIDF); + NewInsertedPHIs.push_back(MPhi); } - for (auto &MPhi : NewInsertedPHIs) { - auto *BBIDF = MPhi->getBlock(); - for (auto *Pred : predecessors(BBIDF)) { - DenseMap> CachedPreviousDef; - MPhi->addIncoming(getPreviousDefFromEnd(Pred, CachedPreviousDef), - Pred); - } + // Add the phis created into the IDF blocks to NonOptPhis, so they are not + // optimized out as trivial by the call to getPreviousDefFromEnd below. + // Once they are complete, all these Phis are added to the FixupList, and + // removed from NonOptPhis inside fixupDefs(). Existing Phis in IDF may + // need fixing as well, and potentially be trivial before this insertion, + // hence add all IDF Phis. See PR43044. 
+ NonOptPhis.insert(MPhi); + } + for (auto &MPhi : NewInsertedPHIs) { + auto *BBIDF = MPhi->getBlock(); + for (auto *Pred : predecessors(BBIDF)) { + DenseMap> CachedPreviousDef; + MPhi->addIncoming(getPreviousDefFromEnd(Pred, CachedPreviousDef), Pred); } + } - // Re-take the index where we're adding the new phis, because the above - // call to getPreviousDefFromEnd, may have inserted into InsertedPHIs. - NewPhiIndex = InsertedPHIs.size(); - for (auto &MPhi : NewInsertedPHIs) { - InsertedPHIs.push_back(&*MPhi); - FixupList.push_back(&*MPhi); - } + // Re-take the index where we're adding the new phis, because the above call + // to getPreviousDefFromEnd, may have inserted into InsertedPHIs. + NewPhiIndex = InsertedPHIs.size(); + for (auto &MPhi : NewInsertedPHIs) { + InsertedPHIs.push_back(&*MPhi); + FixupList.push_back(&*MPhi); } + FixupList.push_back(MD); } @@ -1082,7 +1085,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef Updates, for (; UI != E;) { Use &U = *UI; ++UI; - MemoryAccess *Usr = dyn_cast(U.getUser()); + MemoryAccess *Usr = cast(U.getUser()); if (MemoryPhi *UsrPhi = dyn_cast(Usr)) { BasicBlock *DominatedBlock = UsrPhi->getIncomingBlock(U); if (!DT.dominates(DominatingBlock, DominatedBlock)) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 9c377552b4a360..bd1115f238b360 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1063,7 +1063,7 @@ static int getDecodedUnaryOpcode(unsigned Val, Type *Ty) { switch (Val) { default: return -1; - case bitc::UNOP_NEG: + case bitc::UNOP_FNEG: return IsFP ? Instruction::FNeg : -1; } } diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 049164c7be70f4..4da51dda8b7479 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -515,7 +515,7 @@ class MetadataLoader::MetadataLoaderImpl { GV.getMetadata(LLVMContext::MD_dbg, MDs); GV.eraseMetadata(LLVMContext::MD_dbg); for (auto *MD : MDs) - if (auto *DGV = dyn_cast_or_null(MD)) { + if (auto *DGV = dyn_cast(MD)) { auto *DGVE = DIGlobalVariableExpression::getDistinct( Context, DGV, DIExpression::get(Context, {})); GV.addMetadata(LLVMContext::MD_dbg, *DGVE); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 2818e843b50b65..deb4019ea8ba73 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -520,7 +520,7 @@ static unsigned getEncodedCastOpcode(unsigned Opcode) { static unsigned getEncodedUnaryOpcode(unsigned Opcode) { switch (Opcode) { default: llvm_unreachable("Unknown binary instruction!"); - case Instruction::FNeg: return bitc::UNOP_NEG; + case Instruction::FNeg: return bitc::UNOP_FNEG; } } diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 4db0da1fcd3448..513361e1341789 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2476,6 +2476,7 @@ static void emitGlobalConstantStruct(const DataLayout &DL, } static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) { + assert(ET && "Unknown float type"); APInt API = APF.bitcastToAPInt(); // First print a comment with what we think the original floating-point value @@ -2483,11 +2484,7 @@ static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) { if (AP.isVerbose()) { SmallString<8> StrVal; 
APF.toString(StrVal); - - if (ET) - ET->print(AP.OutStreamer->GetCommentOS()); - else - AP.OutStreamer->GetCommentOS() << "Printing Type"; + ET->print(AP.OutStreamer->GetCommentOS()); AP.OutStreamer->GetCommentOS() << ' ' << StrVal << '\n'; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h b/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h index 789291771b5a19..a062baf7698a79 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h +++ b/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h @@ -38,14 +38,10 @@ class DebugLocStream { : CU(CU), EntryOffset(EntryOffset) {} }; struct Entry { - const MCSymbol *BeginSym; - const MCSymbol *EndSym; + const MCSymbol *Begin; + const MCSymbol *End; size_t ByteOffset; size_t CommentOffset; - Entry(const MCSymbol *BeginSym, const MCSymbol *EndSym, size_t ByteOffset, - size_t CommentOffset) - : BeginSym(BeginSym), EndSym(EndSym), ByteOffset(ByteOffset), - CommentOffset(CommentOffset) {} }; private: @@ -93,7 +89,7 @@ class DebugLocStream { /// Until the next call, bytes added to the stream will be added to this /// entry. void startEntry(const MCSymbol *BeginSym, const MCSymbol *EndSym) { - Entries.emplace_back(BeginSym, EndSym, DWARFBytes.size(), Comments.size()); + Entries.push_back({BeginSym, EndSym, DWARFBytes.size(), Comments.size()}); } /// Finalize a .debug_loc entry, deleting if it's empty. diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index c1c5c4f010c7ae..69c4d3fb5b4494 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -326,13 +326,13 @@ void DwarfCompileUnit::addRange(RangeSpan Range) { // emitted into and the subprogram was contained within. If these are the // same then extend our current range, otherwise add this as a new range. if (CURanges.empty() || !SameAsPrevCU || - (&CURanges.back().getEnd()->getSection() != - &Range.getEnd()->getSection())) { + (&CURanges.back().End->getSection() != + &Range.End->getSection())) { CURanges.push_back(Range); return; } - CURanges.back().setEnd(Range.getEnd()); + CURanges.back().End = Range.End; } void DwarfCompileUnit::initStmtList() { @@ -506,7 +506,7 @@ void DwarfCompileUnit::attachRangesOrLowHighPC( if (Ranges.size() == 1 || !DD->useRangesSection()) { const RangeSpan &Front = Ranges.front(); const RangeSpan &Back = Ranges.back(); - attachLowHighPC(Die, Front.getStart(), Back.getEnd()); + attachLowHighPC(Die, Front.Begin, Back.End); } else addScopeRangeList(Die, std::move(Ranges)); } @@ -516,8 +516,8 @@ void DwarfCompileUnit::attachRangesOrLowHighPC( SmallVector List; List.reserve(Ranges.size()); for (const InsnRange &R : Ranges) - List.push_back(RangeSpan(DD->getLabelBeforeInsn(R.first), - DD->getLabelAfterInsn(R.second))); + List.push_back( + {DD->getLabelBeforeInsn(R.first), DD->getLabelAfterInsn(R.second)}); attachRangesOrLowHighPC(Die, std::move(List)); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 8237404bf8e9fd..61a5445ff41140 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1054,7 +1054,7 @@ void DwarfDebug::finalizeModuleInfo() { // If we're splitting the dwarf out now that we've got the entire // CU then add the dwo id to it. 
auto *SkCU = TheCU.getSkeleton(); - if (useSplitDwarf() && !empty(TheCU.getUnitDie().children())) { + if (useSplitDwarf() && !TheCU.getUnitDie().children().empty()) { finishUnitAttributes(TheCU.getCUNode(), TheCU); TheCU.addString(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_name, Asm->TM.Options.MCOptions.SplitDwarfFile); @@ -1098,7 +1098,7 @@ void DwarfDebug::finalizeModuleInfo() { // 2.17.3). U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0); else - U.setBaseAddress(TheCU.getRanges().front().getStart()); + U.setBaseAddress(TheCU.getRanges().front().Begin); U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges()); } @@ -1106,7 +1106,7 @@ void DwarfDebug::finalizeModuleInfo() { // is a bit pessimistic under LTO. if (!AddrPool.isEmpty() && (getDwarfVersion() >= 5 || - (SkCU && !empty(TheCU.getUnitDie().children())))) + (SkCU && !TheCU.getUnitDie().children().empty()))) U.addAddrTableBase(); if (getDwarfVersion() >= 5) { @@ -1807,7 +1807,7 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { collectEntityInfo(TheCU, SP, Processed); // Add the range of this function to the list of ranges for the CU. - TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd())); + TheCU.addRange({Asm->getFunctionBegin(), Asm->getFunctionEnd()}); // Under -gmlt, skip building the subprogram if there are no inlined // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram @@ -2325,12 +2325,12 @@ void DwarfDebug::emitDebugLoc() { Asm->OutStreamer->AddComment("DW_LLE_offset_pair"); Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_offset_pair, 1); Asm->OutStreamer->AddComment(" starting offset"); - Asm->EmitLabelDifferenceAsULEB128(Entry.BeginSym, Base); + Asm->EmitLabelDifferenceAsULEB128(Entry.Begin, Base); Asm->OutStreamer->AddComment(" ending offset"); - Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Base); + Asm->EmitLabelDifferenceAsULEB128(Entry.End, Base); } else { - Asm->EmitLabelDifference(Entry.BeginSym, Base, Size); - Asm->EmitLabelDifference(Entry.EndSym, Base, Size); + Asm->EmitLabelDifference(Entry.Begin, Base, Size); + Asm->EmitLabelDifference(Entry.End, Base, Size); } emitDebugLocEntryLocation(Entry, CU); @@ -2346,12 +2346,12 @@ void DwarfDebug::emitDebugLoc() { Asm->OutStreamer->AddComment("DW_LLE_startx_length"); Asm->emitInt8(dwarf::DW_LLE_startx_length); Asm->OutStreamer->AddComment(" start idx"); - Asm->EmitULEB128(AddrPool.getIndex(Entry.BeginSym)); + Asm->EmitULEB128(AddrPool.getIndex(Entry.Begin)); Asm->OutStreamer->AddComment(" length"); - Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Entry.BeginSym); + Asm->EmitLabelDifferenceAsULEB128(Entry.End, Entry.Begin); } else { - Asm->OutStreamer->EmitSymbolValue(Entry.BeginSym, Size); - Asm->OutStreamer->EmitSymbolValue(Entry.EndSym, Size); + Asm->OutStreamer->EmitSymbolValue(Entry.Begin, Size); + Asm->OutStreamer->EmitSymbolValue(Entry.End, Size); } emitDebugLocEntryLocation(Entry, CU); @@ -2386,9 +2386,9 @@ void DwarfDebug::emitDebugLocDWO() { // Ideally/in v5, this could use SectionLabels to reuse existing addresses // in the address pool to minimize object size/relocations. 
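The offset-pair forms emitted above (DW_LLE_offset_pair here, and DW_RLE_offset_pair in the range-list code further down) encode their starting and ending offsets as ULEB128, which is what EmitLabelDifferenceAsULEB128 produces. A self-contained sketch of that encoding:

#include <cassert>
#include <cstdint>
#include <vector>

// Minimal ULEB128 encoder: 7 data bits per byte, high bit set on all but
// the last byte.
static void encodeULEB128(uint64_t Value, std::vector<uint8_t> &Out) {
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // continuation bit: more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
}

int main() {
  std::vector<uint8_t> Out;
  encodeULEB128(624485, Out); // the worked example from the DWARF spec
  assert((Out == std::vector<uint8_t>{0xE5, 0x8E, 0x26}));
  return 0;
}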
Asm->emitInt8(dwarf::DW_LLE_startx_length); - unsigned idx = AddrPool.getIndex(Entry.BeginSym); + unsigned idx = AddrPool.getIndex(Entry.Begin); Asm->EmitULEB128(idx); - Asm->EmitLabelDifference(Entry.EndSym, Entry.BeginSym, 4); + Asm->EmitLabelDifference(Entry.End, Entry.Begin, 4); emitDebugLocEntryLocation(Entry, List.CU); } @@ -2556,23 +2556,24 @@ void DwarfDebug::emitDebugARanges() { } } -/// Emit a single range list. We handle both DWARF v5 and earlier. -static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, - const RangeSpanList &List) { - +template +static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, MCSymbol *Sym, + const Ranges &R, const DwarfCompileUnit &CU, + unsigned BaseAddressx, unsigned OffsetPair, + unsigned StartxLength, unsigned EndOfList, + StringRef (*StringifyEnum)(unsigned)) { auto DwarfVersion = DD.getDwarfVersion(); // Emit our symbol so we can find the beginning of the range. - Asm->OutStreamer->EmitLabel(List.getSym()); + Asm->OutStreamer->EmitLabel(Sym); // Gather all the ranges that apply to the same section so they can share // a base address entry. MapVector> SectionRanges; // Size for our labels. auto Size = Asm->MAI->getCodePointerSize(); - for (const RangeSpan &Range : List.getRanges()) - SectionRanges[&Range.getStart()->getSection()].push_back(&Range); + for (const RangeSpan &Range : R) + SectionRanges[&Range.Begin->getSection()].push_back(&Range); - const DwarfCompileUnit &CU = List.getCU(); const MCSymbol *CUBase = CU.getBaseAddress(); bool BaseIsSet = false; for (const auto &P : SectionRanges) { @@ -2586,10 +2587,10 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, if (!Base && (P.second.size() > 1 || DwarfVersion < 5) && (CU.getCUNode()->getRangesBaseAddress() || DwarfVersion >= 5)) { BaseIsSet = true; - Base = DD.getSectionLabel(&P.second.front()->getStart()->getSection()); + Base = DD.getSectionLabel(&P.second.front()->Begin->getSection()); if (DwarfVersion >= 5) { - Asm->OutStreamer->AddComment("DW_RLE_base_addressx"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_addressx, 1); + Asm->OutStreamer->AddComment(StringifyEnum(BaseAddressx)); + Asm->OutStreamer->EmitIntValue(BaseAddressx, 1); Asm->OutStreamer->AddComment(" base address index"); Asm->EmitULEB128(DD.getAddressPool().getIndex(Base)); } else { @@ -2605,15 +2606,15 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, } for (const auto *RS : P.second) { - const MCSymbol *Begin = RS->getStart(); - const MCSymbol *End = RS->getEnd(); + const MCSymbol *Begin = RS->Begin; + const MCSymbol *End = RS->End; assert(Begin && "Range without a begin symbol?"); assert(End && "Range without an end symbol?"); if (Base) { if (DwarfVersion >= 5) { // Emit DW_RLE_offset_pair when we have a base. 
- Asm->OutStreamer->AddComment("DW_RLE_offset_pair"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_offset_pair, 1); + Asm->OutStreamer->AddComment(StringifyEnum(OffsetPair)); + Asm->emitInt8(OffsetPair); Asm->OutStreamer->AddComment(" starting offset"); Asm->EmitLabelDifferenceAsULEB128(Begin, Base); Asm->OutStreamer->AddComment(" ending offset"); @@ -2623,8 +2624,8 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, Asm->EmitLabelDifference(End, Base, Size); } } else if (DwarfVersion >= 5) { - Asm->OutStreamer->AddComment("DW_RLE_startx_length"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_startx_length, 1); + Asm->OutStreamer->AddComment(StringifyEnum(StartxLength)); + Asm->emitInt8(StartxLength); Asm->OutStreamer->AddComment(" start index"); Asm->EmitULEB128(DD.getAddressPool().getIndex(Begin)); Asm->OutStreamer->AddComment(" length"); @@ -2636,8 +2637,8 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, } } if (DwarfVersion >= 5) { - Asm->OutStreamer->AddComment("DW_RLE_end_of_list"); - Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_end_of_list, 1); + Asm->OutStreamer->AddComment(StringifyEnum(EndOfList)); + Asm->emitInt8(EndOfList); } else { // Terminate the list with two 0 values. Asm->OutStreamer->EmitIntValue(0, Size); @@ -2645,6 +2646,15 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, } } +/// Emit a single range list. We handle both DWARF v5 and earlier. +static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, + const RangeSpanList &List) { + emitRangeList(DD, Asm, List.getSym(), List.getRanges(), List.getCU(), + dwarf::DW_RLE_base_addressx, dwarf::DW_RLE_offset_pair, + dwarf::DW_RLE_startx_length, dwarf::DW_RLE_end_of_list, + llvm::dwarf::RangeListEncodingString); +} + static void emitDebugRangesImpl(DwarfDebug &DD, AsmPrinter *Asm, const DwarfFile &Holder, MCSymbol *TableEnd) { for (const RangeSpanList &List : Holder.getRangeLists()) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index 244678ce9dc129..25ed8da970a409 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -32,15 +32,9 @@ class LexicalScope; class MCSection; // Data structure to hold a range for range lists. 
-class RangeSpan { -public: - RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {} - const MCSymbol *getStart() const { return Start; } - const MCSymbol *getEnd() const { return End; } - void setEnd(const MCSymbol *E) { End = E; } - -private: - const MCSymbol *Start, *End; +struct RangeSpan { + const MCSymbol *Begin; + const MCSymbol *End; }; class RangeSpanList { diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index 155e91ce61a1b2..0398675577cd5a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -982,8 +982,7 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) { OS.EmitValueToAlignment(4); OS.EmitLabel(LSDALabel); - const Function *Per = - dyn_cast(F.getPersonalityFn()->stripPointerCasts()); + const auto *Per = cast(F.getPersonalityFn()->stripPointerCasts()); StringRef PerName = Per->getName(); int BaseState = -1; if (PerName == "_except_handler4") { diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index cc3379f13b4d69..27b298dcf6afd1 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -469,7 +469,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { Value *NewAddr = Builder.CreateBitCast(Addr, PT); StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); - NewSI->setAlignment(SI->getAlignment()); + NewSI->setAlignment(MaybeAlign(SI->getAlignment())); NewSI->setVolatile(SI->isVolatile()); NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n"); diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 6c997a73023bca..b0d1599a5ebcf2 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1307,6 +1307,8 @@ static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) { /// result in infinite loops. static bool IsBetterFallthrough(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2) { + assert(MBB1 && MBB2 && "Unknown MachineBasicBlock"); + // Right now, we use a simple heuristic. If MBB2 ends with a call, and // MBB1 doesn't, we prefer to fall through into MBB1. 
This allows us to // optimize branches that branch to either a return block or an assert block diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 3cf0c60108efad..50b469d6d936ad 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -80,6 +80,7 @@ add_llvm_library(LLVMCodeGen MachineInstr.cpp MachineLICM.cpp MachineLoopInfo.cpp + MachineLoopUtils.cpp MachineModuleInfo.cpp MachineModuleInfoImpls.cpp MachineOperand.cpp diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 2ad35b3a72c98d..28143b30d4e8b7 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -79,5 +79,5 @@ bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI, return true; return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() && - !MI.hasUnmodeledSideEffects() && empty(MI.implicit_operands()); + !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty(); } diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index be3e58e1650295..684b99d8bae3f3 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2247,6 +2247,10 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { return lowerShuffleVector(MI); case G_DYN_STACKALLOC: return lowerDynStackAlloc(MI); + case G_EXTRACT: + return lowerExtract(MI); + case G_INSERT: + return lowerInsert(MI); } } @@ -4099,3 +4103,75 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { MI.eraseFromParent(); return Legalized; } + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerExtract(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + unsigned Offset = MI.getOperand(2).getImm(); + + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src); + + if (DstTy.isScalar() && + (SrcTy.isScalar() || + (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) { + LLT SrcIntTy = SrcTy; + if (!SrcTy.isScalar()) { + SrcIntTy = LLT::scalar(SrcTy.getSizeInBits()); + Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0); + } + + if (Offset == 0) + MIRBuilder.buildTrunc(Dst, Src); + else { + auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset); + auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt); + MIRBuilder.buildTrunc(Dst, Shr); + } + + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + +LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + Register InsertSrc = MI.getOperand(2).getReg(); + uint64_t Offset = MI.getOperand(3).getImm(); + + LLT DstTy = MRI.getType(Src); + LLT InsertTy = MRI.getType(InsertSrc); + + if (InsertTy.isScalar() && + (DstTy.isScalar() || + (DstTy.isVector() && DstTy.getElementType() == InsertTy))) { + LLT IntDstTy = DstTy; + if (!DstTy.isScalar()) { + IntDstTy = LLT::scalar(DstTy.getSizeInBits()); + Src = MIRBuilder.buildBitcast(IntDstTy, Src).getReg(0); + } + + Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0); + if (Offset != 0) { + auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset); + ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0); + } + + APInt MaskVal = ~APInt::getBitsSet(DstTy.getSizeInBits(), Offset, + InsertTy.getSizeInBits()); + + auto Mask 
= MIRBuilder.buildConstant(IntDstTy, MaskVal); + auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask); + auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc); + + MIRBuilder.buildBitcast(Dst, Or); + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index ebe3b7c640cf14..70045512fae512 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -433,7 +433,7 @@ LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder( std::initializer_list Opcodes) { unsigned Representative = *Opcodes.begin(); - assert(!empty(Opcodes) && Opcodes.begin() + 1 != Opcodes.end() && + assert(!llvm::empty(Opcodes) && Opcodes.begin() + 1 != Opcodes.end() && "Initializer list must have at least two opcodes"); for (auto I = Opcodes.begin() + 1, E = Opcodes.end(); I != E; ++I) diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp index e69dc136096ea7..f0e35c65c53b87 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -139,7 +139,7 @@ bool RegBankSelect::repairReg( "need new vreg for each breakdown"); // An empty range of new register means no repairing. - assert(!empty(NewVRegs) && "We should not have to repair"); + assert(!NewVRegs.empty() && "We should not have to repair"); MachineInstr *MI; if (ValMapping.NumBreakDowns == 1) { @@ -687,8 +687,9 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) { // iterator before hand. MachineInstr &MI = *MII++; - // Ignore target-specific instructions: they should use proper regclasses. - if (isTargetSpecificOpcode(MI.getOpcode())) + // Ignore target-specific post-isel instructions: they should use proper + // regclasses. + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) continue; if (!assignInstr(MI)) { diff --git a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index 82eaa88abc78d2..3fcc55286bebb0 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -455,7 +455,7 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) { "This mapping is too complex for this function"); iterator_range::const_iterator> NewRegs = OpdMapper.getVRegs(OpIdx); - if (empty(NewRegs)) { + if (NewRegs.empty()) { LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n"); continue; } diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp index 0bdc62a345cbc2..f1b237d83e8cf8 100644 --- a/llvm/lib/CodeGen/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -219,8 +219,7 @@ class LiveDebugValues : public MachineFunctionPass { const ConstantInt *CImm; } Loc; - VarLoc(const MachineInstr &MI, LexicalScopes &LS, - VarLocKind K = InvalidKind) + VarLoc(const MachineInstr &MI, LexicalScopes &LS) : Var(MI), Expr(MI.getDebugExpression()), MI(MI), UVS(MI.getDebugLoc(), LS) { static_assert((sizeof(Loc) == sizeof(uint64_t)), @@ -244,18 +243,78 @@ class LiveDebugValues : public MachineFunctionPass { "entry values must be register locations"); } - /// The constructor for spill locations. 
- VarLoc(const MachineInstr &MI, unsigned SpillBase, int SpillOffset, - LexicalScopes &LS, const MachineInstr &OrigMI) - : Var(MI), Expr(MI.getDebugExpression()), MI(OrigMI), - UVS(MI.getDebugLoc(), LS) { - assert(MI.isDebugValue() && "not a DBG_VALUE"); - assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE"); - Kind = SpillLocKind; - Loc.SpillLocation = {SpillBase, SpillOffset}; + /// Take the variable and machine-location in DBG_VALUE MI, and build an + /// entry location using the given expression. + static VarLoc CreateEntryLoc(const MachineInstr &MI, LexicalScopes &LS, + const DIExpression *EntryExpr) { + VarLoc VL(MI, LS); + VL.Kind = EntryValueKind; + VL.Expr = EntryExpr; + return VL; + } + + /// Copy the register location in DBG_VALUE MI, updating the register to + /// be NewReg. + static VarLoc CreateCopyLoc(const MachineInstr &MI, LexicalScopes &LS, + unsigned NewReg) { + VarLoc VL(MI, LS); + assert(VL.Kind == RegisterKind); + VL.Loc.RegNo = NewReg; + return VL; + } + + /// Take the variable described by DBG_VALUE MI, and create a VarLoc + /// locating it in the specified spill location. + static VarLoc CreateSpillLoc(const MachineInstr &MI, unsigned SpillBase, + int SpillOffset, LexicalScopes &LS) { + VarLoc VL(MI, LS); + assert(VL.Kind == RegisterKind); + VL.Kind = SpillLocKind; + VL.Loc.SpillLocation = {SpillBase, SpillOffset}; + return VL; } - // Is the Loc field a constant or constant object? + /// Create a DBG_VALUE representing this VarLoc in the given function. + /// Copies variable-specific information such as DILocalVariable and + /// inlining information from the original DBG_VALUE instruction, which may + /// have been several transfers ago. + MachineInstr *BuildDbgValue(MachineFunction &MF) const { + const DebugLoc &DbgLoc = MI.getDebugLoc(); + bool Indirect = MI.isIndirectDebugValue(); + const auto &IID = MI.getDesc(); + const DILocalVariable *Var = MI.getDebugVariable(); + const DIExpression *DIExpr = MI.getDebugExpression(); + + switch (Kind) { + case EntryValueKind: + // An entry value is a register location -- but with an updated + // expression. + return BuildMI(MF, DbgLoc, IID, Indirect, Loc.RegNo, Var, Expr); + case RegisterKind: + // Register locations are like the source DBG_VALUE, but with the + // register number from this VarLoc. + return BuildMI(MF, DbgLoc, IID, Indirect, Loc.RegNo, Var, DIExpr); + case SpillLocKind: { + // Spills are indirect DBG_VALUEs, with a base register and offset. + // Use the original DBG_VALUEs expression to build the spilt location + // on top of. FIXME: spill locations created before this pass runs + // are not recognized, and not handled here. + auto *SpillExpr = DIExpression::prepend( + DIExpr, DIExpression::ApplyOffset, Loc.SpillLocation.SpillOffset); + unsigned Base = Loc.SpillLocation.SpillBase; + return BuildMI(MF, DbgLoc, IID, true, Base, Var, SpillExpr); + } + case ImmediateKind: { + MachineOperand MO = MI.getOperand(0); + return BuildMI(MF, DbgLoc, IID, Indirect, MO, Var, DIExpr); + } + case InvalidKind: + llvm_unreachable("Tried to produce DBG_VALUE for invalid VarLoc"); + } + llvm_unreachable("Unrecognized LiveDebugValues.VarLoc.Kind enum"); + } + + /// Is the Loc field a constant or constant object? 
bool isConstant() const { return Kind == ImmediateKind; } /// If this variable is described by a register, return it, @@ -271,7 +330,31 @@ class LiveDebugValues : public MachineFunctionPass { bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - LLVM_DUMP_METHOD void dump() const { MI.dump(); } + // TRI can be null. + void dump(const TargetRegisterInfo *TRI, raw_ostream &Out = dbgs()) const { + dbgs() << "VarLoc("; + switch (Kind) { + case RegisterKind: + case EntryValueKind: + dbgs() << printReg(Loc.RegNo, TRI); + break; + case SpillLocKind: + dbgs() << printReg(Loc.SpillLocation.SpillBase, TRI); + dbgs() << "[" << Loc.SpillLocation.SpillOffset << "]"; + break; + case ImmediateKind: + dbgs() << Loc.Immediate; + break; + case InvalidKind: + llvm_unreachable("Invalid VarLoc in dump method"); + } + + dbgs() << ", \"" << Var.getVar()->getName() << "\", " << *Expr << ", "; + if (Var.getInlinedAt()) + dbgs() << "!" << Var.getInlinedAt()->getMetadataID() << ")\n"; + else + dbgs() << "(null))\n"; + } #endif bool operator==(const VarLoc &Other) const { @@ -291,8 +374,8 @@ class LiveDebugValues : public MachineFunctionPass { using VarLocSet = SparseBitVector<>; using VarLocInMBB = SmallDenseMap; struct TransferDebugPair { - MachineInstr *TransferInst; - MachineInstr *DebugInst; + MachineInstr *TransferInst; /// Instruction where this transfer occurs. + unsigned LocationID; /// Location number for the transfer dest. }; using TransferMap = SmallVector; @@ -561,7 +644,7 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, const VarLoc &VL = VarLocIDs[VLL]; Out << " Var: " << VL.Var.getVar()->getName(); Out << " MI: "; - VL.dump(); + VL.dump(TRI, Out); } } Out << "\n"; @@ -624,7 +707,6 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI, TransferMap &Transfers, DebugParamMap &DebugEntryVals, SparseBitVector<> &KillSet) { - MachineFunction *MF = MI.getParent()->getParent(); for (unsigned ID : KillSet) { if (!VarLocIDs[ID].Var.getVar()->isParameter()) continue; @@ -639,20 +721,12 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI, auto ParamDebugInstr = DebugEntryVals[CurrDebugInstr->getDebugVariable()]; DIExpression *NewExpr = DIExpression::prepend( ParamDebugInstr->getDebugExpression(), DIExpression::EntryValue); - MachineInstr *EntryValDbgMI = - BuildMI(*MF, ParamDebugInstr->getDebugLoc(), ParamDebugInstr->getDesc(), - ParamDebugInstr->isIndirectDebugValue(), - ParamDebugInstr->getOperand(0).getReg(), - ParamDebugInstr->getDebugVariable(), NewExpr); - - if (ParamDebugInstr->isIndirectDebugValue()) - EntryValDbgMI->getOperand(1).setImm( - ParamDebugInstr->getOperand(1).getImm()); - - Transfers.push_back({&MI, EntryValDbgMI}); - VarLoc VL(*EntryValDbgMI, LS); - unsigned EntryValLocID = VarLocIDs.insert(VL); - OpenRanges.insert(EntryValLocID, VL.Var); + + VarLoc EntryLoc = VarLoc::CreateEntryLoc(*ParamDebugInstr, LS, NewExpr); + + unsigned EntryValLocID = VarLocIDs.insert(EntryLoc); + Transfers.push_back({&MI, EntryValLocID}); + OpenRanges.insert(EntryValLocID, EntryLoc.Var); } } @@ -666,21 +740,19 @@ void LiveDebugValues::insertTransferDebugPair( VarLocMap &VarLocIDs, unsigned OldVarID, TransferKind Kind, unsigned NewReg) { const MachineInstr *DebugInstr = &VarLocIDs[OldVarID].MI; - MachineFunction *MF = MI.getParent()->getParent(); - MachineInstr *NewDebugInstr; auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &DebugInstr, - &VarLocIDs](VarLoc &VL, MachineInstr *NewDebugInstr) { + 
&VarLocIDs](VarLoc &VL) { unsigned LocId = VarLocIDs.insert(VL); // Close this variable's previous location range. DebugVariable V(*DebugInstr); OpenRanges.erase(V); + // Record the new location as an open range, and a postponed transfer + // inserting a DBG_VALUE for this location. OpenRanges.insert(LocId, VL.Var); - // The newly created DBG_VALUE instruction NewDebugInstr must be inserted - // after MI. Keep track of the pairing. - TransferDebugPair MIP = {&MI, NewDebugInstr}; + TransferDebugPair MIP = {&MI, LocId}; Transfers.push_back(MIP); }; @@ -692,37 +764,25 @@ void LiveDebugValues::insertTransferDebugPair( "No register supplied when handling a copy of a debug value"); // Create a DBG_VALUE instruction to describe the Var in its new // register location. - NewDebugInstr = BuildMI( - *MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), - DebugInstr->isIndirectDebugValue(), NewReg, - DebugInstr->getDebugVariable(), DebugInstr->getDebugExpression()); - if (DebugInstr->isIndirectDebugValue()) - NewDebugInstr->getOperand(1).setImm(DebugInstr->getOperand(1).getImm()); - VarLoc VL(*NewDebugInstr, LS); - ProcessVarLoc(VL, NewDebugInstr); - LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register copy: "; - NewDebugInstr->print(dbgs(), /*IsStandalone*/false, - /*SkipOpers*/false, /*SkipDebugLoc*/false, - /*AddNewLine*/true, TII)); + VarLoc VL = VarLoc::CreateCopyLoc(*DebugInstr, LS, NewReg); + ProcessVarLoc(VL); + LLVM_DEBUG({ + dbgs() << "Creating VarLoc for register copy:"; + VL.dump(TRI); + }); return; } case TransferKind::TransferSpill: { // Create a DBG_VALUE instruction to describe the Var in its spilled // location. VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI); - auto *SpillExpr = DIExpression::prepend(DebugInstr->getDebugExpression(), - DIExpression::ApplyOffset, - SpillLocation.SpillOffset); - NewDebugInstr = BuildMI( - *MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), true, - SpillLocation.SpillBase, DebugInstr->getDebugVariable(), SpillExpr); - VarLoc VL(*NewDebugInstr, SpillLocation.SpillBase, - SpillLocation.SpillOffset, LS, *DebugInstr); - ProcessVarLoc(VL, NewDebugInstr); - LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: "; - NewDebugInstr->print(dbgs(), /*IsStandalone*/false, - /*SkipOpers*/false, /*SkipDebugLoc*/false, - /*AddNewLine*/true, TII)); + VarLoc VL = VarLoc::CreateSpillLoc(*DebugInstr, SpillLocation.SpillBase, + SpillLocation.SpillOffset, LS); + ProcessVarLoc(VL); + LLVM_DEBUG({ + dbgs() << "Creating VarLoc for spill:"; + VL.dump(TRI); + }); return; } case TransferKind::TransferRestore: { @@ -732,15 +792,12 @@ void LiveDebugValues::insertTransferDebugPair( DIBuilder DIB(*const_cast(MF->getFunction()).getParent()); // DebugInstr refers to the pre-spill location, therefore we can reuse // its expression. 
- NewDebugInstr = BuildMI( - *MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), false, NewReg, - DebugInstr->getDebugVariable(), DebugInstr->getDebugExpression()); - VarLoc VL(*NewDebugInstr, LS); - ProcessVarLoc(VL, NewDebugInstr); - LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register restore: "; - NewDebugInstr->print(dbgs(), /*IsStandalone*/false, - /*SkipOpers*/false, /*SkipDebugLoc*/false, - /*AddNewLine*/true, TII)); + VarLoc VL = VarLoc::CreateCopyLoc(*DebugInstr, LS, NewReg); + ProcessVarLoc(VL); + LLVM_DEBUG({ + dbgs() << "Creating VarLoc for restore:"; + VL.dump(TRI); + }); return; } } @@ -895,11 +952,9 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI, // At this stage, we already know which DBG_VALUEs are for spills and // where they are located; it's best to fix handle overwrites now. KillSet.set(ID); - MachineInstr *NewDebugInstr = - BuildMI(*MF, VL.MI.getDebugLoc(), VL.MI.getDesc(), - VL.MI.isIndirectDebugValue(), 0, // $noreg - VL.MI.getDebugVariable(), VL.MI.getDebugExpression()); - Transfers.push_back({&MI, NewDebugInstr}); + VarLoc UndefVL = VarLoc::CreateCopyLoc(VL.MI, LS, 0); + unsigned UndefLocID = VarLocIDs.insert(UndefVL); + Transfers.push_back({&MI, UndefLocID}); } } OpenRanges.erase(KillSet, VarLocIDs); @@ -986,14 +1041,11 @@ bool LiveDebugValues::transferTerminator(MachineBasicBlock *CurMBB, const VarLocMap &VarLocIDs) { bool Changed = false; - if (OpenRanges.empty()) - return false; - LLVM_DEBUG(for (unsigned ID : OpenRanges.getVarLocs()) { // Copy OpenRanges to OutLocs, if not already present. dbgs() << "Add to OutLocs in MBB #" << CurMBB->getNumber() << ": "; - VarLocIDs[ID].dump(); + VarLocIDs[ID].dump(TRI); }); VarLocSet &VLS = OutLocs[CurMBB]; Changed = VLS != OpenRanges.getVarLocs(); @@ -1195,35 +1247,9 @@ void LiveDebugValues::flushPendingLocs(VarLocInMBB &PendingInLocs, // The ID location is live-in to MBB -- work out what kind of machine // location it is and create a DBG_VALUE. const VarLoc &DiffIt = VarLocIDs[ID]; - const MachineInstr *DebugInstr = &DiffIt.MI; - MachineInstr *MI = nullptr; - - if (DiffIt.isConstant()) { - MachineOperand MO(DebugInstr->getOperand(0)); - MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(), - DebugInstr->getDesc(), false, MO, - DebugInstr->getDebugVariable(), - DebugInstr->getDebugExpression()); - } else { - auto *DebugExpr = DebugInstr->getDebugExpression(); - Register Reg = DebugInstr->getOperand(0).getReg(); - bool IsIndirect = DebugInstr->isIndirectDebugValue(); - - if (DiffIt.Kind == VarLoc::SpillLocKind) { - // This is is a spilt location; DebugInstr refers to the unspilt - // location. We need to rebuild the spilt location expression and - // point the DBG_VALUE at the frame register. - DebugExpr = DIExpression::prepend( - DebugInstr->getDebugExpression(), DIExpression::ApplyOffset, - DiffIt.Loc.SpillLocation.SpillOffset); - Reg = TRI->getFrameRegister(*DebugInstr->getMF()); - IsIndirect = true; - } + MachineInstr *MI = DiffIt.BuildDbgValue(*MBB.getParent()); + MBB.insert(MBB.instr_begin(), MI); - MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(), - DebugInstr->getDesc(), IsIndirect, Reg, - DebugInstr->getDebugVariable(), DebugExpr); - } (void)MI; LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump();); } @@ -1362,11 +1388,6 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { DebugEntryVals, OverlapFragments, SeenFragments); OLChanged |= transferTerminator(MBB, OpenRanges, OutLocs, VarLocIDs); - // Add any DBG_VALUE instructions necessitated by spills. 
- for (auto &TR : Transfers) - MBB->insertAfterBundle(TR.TransferInst->getIterator(), TR.DebugInst); - Transfers.clear(); - LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after propagating", dbgs())); LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, @@ -1387,6 +1408,15 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { assert(Pending.empty() && "Pending should be empty"); } + // Add any DBG_VALUE instructions created by location transfers. + for (auto &TR : Transfers) { + MachineBasicBlock *MBB = TR.TransferInst->getParent(); + const VarLoc &VL = VarLocIDs[TR.LocationID]; + MachineInstr *MI = VL.BuildDbgValue(MF); + MBB->insertAfterBundle(TR.TransferInst->getIterator(), MI); + } + Transfers.clear(); + // Deferred inlocs will not have had any DBG_VALUE insts created; do // that now. flushPendingLocs(PendingInLocs, VarLocIDs); diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 27438ecf0adc86..ac19bc0bd8ea28 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -38,7 +38,6 @@ #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -2713,7 +2712,6 @@ void MachineBlockPlacement::optimizeBranches() { // cannot because all branches may not be analyzable. // E.g., the target may be able to remove an unconditional branch to // a fallthrough when it occurs after predicated terminators. - SmallVector EmptyBB; for (MachineBasicBlock *ChainBB : FunctionChain) { Cond.clear(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. @@ -2733,50 +2731,9 @@ void MachineBlockPlacement::optimizeBranches() { TII->removeBranch(*ChainBB); TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl); ChainBB->updateTerminator(); - } else if (Cond.empty() && TBB && ChainBB != TBB && !TBB->empty() && - !TBB->canFallThrough()) { - // When ChainBB is unconditional branch to the TBB, and TBB has no - // fallthrough predecessor and fallthrough successor, try to merge - // ChainBB and TBB. This is legal under the one of following conditions: - // 1. ChainBB is empty except for an unconditional branch. - // 2. TBB has only one predecessor. - MachineFunction::iterator I(TBB); - if (((TBB == &*F->begin()) || !std::prev(I)->canFallThrough()) && - (TailDup.isSimpleBB(ChainBB) || (TBB->pred_size() == 1))) { - TII->removeBranch(*ChainBB); - ChainBB->removeSuccessor(TBB); - - // Update the CFG. - while (!TBB->pred_empty()) { - MachineBasicBlock *Pred = *(TBB->pred_end() - 1); - Pred->ReplaceUsesOfBlockWith(TBB, ChainBB); - } - - while (!TBB->succ_empty()) { - MachineBasicBlock *Succ = *(TBB->succ_end() - 1); - ChainBB->addSuccessor(Succ, MBPI->getEdgeProbability(TBB, Succ)); - TBB->removeSuccessor(Succ); - } - - // Move all the instructions of TBB to ChainBB. - ChainBB->splice(ChainBB->end(), TBB, TBB->begin(), TBB->end()); - EmptyBB.push_back(TBB); - - // If TBB was the target of a jump table, update jump tables to go to - // the ChainBB instead. 
- if (MachineJumpTableInfo *MJTI = F->getJumpTableInfo()) - MJTI->ReplaceMBBInJumpTables(TBB, ChainBB); - } } } } - - for (auto BB: EmptyBB) { - MLI->removeBlock(BB); - FunctionChain.remove(BB); - BlockToChain.erase(BB); - F->erase(BB); - } } void MachineBlockPlacement::alignBlocks() { @@ -3097,9 +3054,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { } } - // optimizeBranches() may change the blocks, but we haven't updated the - // post-dominator tree. Because the post-dominator tree won't be used after - // this function and this pass don't preserve the post-dominator tree. optimizeBranches(); alignBlocks(); diff --git a/llvm/lib/CodeGen/MachineLoopUtils.cpp b/llvm/lib/CodeGen/MachineLoopUtils.cpp new file mode 100644 index 00000000000000..e074b76082f0e2 --- /dev/null +++ b/llvm/lib/CodeGen/MachineLoopUtils.cpp @@ -0,0 +1,132 @@ +//=- MachineLoopUtils.cpp - Functions for manipulating loops ----------------=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineLoopUtils.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +using namespace llvm; + +namespace { +// MI's parent and BB are clones of each other. Find the equivalent copy of MI +// in BB. +MachineInstr &findEquivalentInstruction(MachineInstr &MI, + MachineBasicBlock *BB) { + MachineBasicBlock *PB = MI.getParent(); + unsigned Offset = std::distance(PB->instr_begin(), MachineBasicBlock::instr_iterator(MI)); + return *std::next(BB->instr_begin(), Offset); +} +} // namespace + +MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction, + MachineBasicBlock *Loop, + MachineRegisterInfo &MRI, + const TargetInstrInfo *TII) { + MachineFunction &MF = *Loop->getParent(); + MachineBasicBlock *Preheader = *Loop->pred_begin(); + if (Preheader == Loop) + Preheader = *std::next(Loop->pred_begin()); + MachineBasicBlock *Exit = *Loop->succ_begin(); + if (Exit == Loop) + Exit = *std::next(Loop->succ_begin()); + + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(Loop->getBasicBlock()); + if (Direction == LPD_Front) + MF.insert(Loop->getIterator(), NewBB); + else + MF.insert(std::next(Loop->getIterator()), NewBB); + + // FIXME: Add DenseMapInfo trait for Register so we can use it as a key. + DenseMap Remaps; + auto InsertPt = NewBB->end(); + for (MachineInstr &MI : *Loop) { + MachineInstr *NewMI = MF.CloneMachineInstr(&MI); + NewBB->insert(InsertPt, NewMI); + for (MachineOperand &MO : NewMI->defs()) { + Register OrigR = MO.getReg(); + if (OrigR.isPhysical()) + continue; + Register &R = Remaps[OrigR]; + R = MRI.createVirtualRegister(MRI.getRegClass(OrigR)); + MO.setReg(R); + + if (Direction == LPD_Back) { + // Replace all uses outside the original loop with the new register. + // FIXME: is the use_iterator stable enough to mutate register uses + // while iterating? 
+ SmallVector<MachineOperand *, 4> Uses; + for (auto &Use : MRI.use_operands(OrigR)) + if (Use.getParent()->getParent() != Loop) + Uses.push_back(&Use); + for (auto *Use : Uses) { + MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg())); + Use->setReg(R); + } + } + } + } + + for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I) + for (MachineOperand &MO : I->uses()) + if (MO.isReg() && Remaps.count(MO.getReg())) + MO.setReg(Remaps[MO.getReg()]); + + for (auto I = NewBB->begin(); I->isPHI(); ++I) { + MachineInstr &MI = *I; + unsigned LoopRegIdx = 3, InitRegIdx = 1; + if (MI.getOperand(2).getMBB() != Preheader) + std::swap(LoopRegIdx, InitRegIdx); + MachineInstr &OrigPhi = findEquivalentInstruction(MI, Loop); + assert(OrigPhi.isPHI()); + if (Direction == LPD_Front) { + // When peeling front, we are only left with the initial value from the + // preheader. + Register R = MI.getOperand(LoopRegIdx).getReg(); + if (Remaps.count(R)) + R = Remaps[R]; + OrigPhi.getOperand(InitRegIdx).setReg(R); + MI.RemoveOperand(LoopRegIdx + 1); + MI.RemoveOperand(LoopRegIdx + 0); + } else { + // When peeling back, the initial value is the loop-carried value from + // the original loop. + Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg(); + MI.getOperand(LoopRegIdx).setReg(LoopReg); + MI.RemoveOperand(InitRegIdx + 1); + MI.RemoveOperand(InitRegIdx + 0); + } + } + + DebugLoc DL; + if (Direction == LPD_Front) { + Preheader->replaceSuccessor(Loop, NewBB); + NewBB->addSuccessor(Loop); + Loop->replacePhiUsesWith(Preheader, NewBB); + if (TII->removeBranch(*Preheader) > 0) + TII->insertBranch(*Preheader, NewBB, nullptr, {}, DL); + TII->removeBranch(*NewBB); + TII->insertBranch(*NewBB, Loop, nullptr, {}, DL); + } else { + Loop->replaceSuccessor(Exit, NewBB); + Exit->replacePhiUsesWith(Loop, NewBB); + NewBB->addSuccessor(Exit); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 4> Cond; + bool CanAnalyzeBr = !TII->analyzeBranch(*Loop, TBB, FBB, Cond); + (void)CanAnalyzeBr; + assert(CanAnalyzeBr && "Must be able to analyze the loop branch!"); + TII->removeBranch(*Loop); + TII->insertBranch(*Loop, TBB == Exit ? NewBB : TBB, + FBB == Exit ?
NewBB : FBB, Cond, DL); + if (TII->removeBranch(*NewBB) > 0) + TII->insertBranch(*NewBB, Exit, nullptr, {}, DL); + } + + return NewBB; +} diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp index 50a9251780e694..e0b4e9cac229e7 100644 --- a/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -346,7 +346,7 @@ char MachineModuleInfoWrapperPass::ID = 0; bool MachineModuleInfoWrapperPass::doInitialization(Module &M) { MMI.initialize(); MMI.TheModule = &M; - MMI.DbgInfoAvailable = !empty(M.debug_compile_units()); + MMI.DbgInfoAvailable = !M.debug_compile_units().empty(); return false; } @@ -361,6 +361,6 @@ MachineModuleInfo MachineModuleAnalysis::run(Module &M, ModuleAnalysisManager &) { MachineModuleInfo MMI(TM); MMI.TheModule = &M; - MMI.DbgInfoAvailable = !empty(M.debug_compile_units()); + MMI.DbgInfoAvailable = !M.debug_compile_units().empty(); return MMI; } diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 5ec891b7aa0cae..8eccfb85a94618 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -1065,17 +1065,6 @@ uint64_t MachineMemOperand::getAlignment() const { return MinAlign(getBaseAlignment(), getOffset()); } -void MachineMemOperand::print(raw_ostream &OS) const { - ModuleSlotTracker DummyMST(nullptr); - print(OS, DummyMST); -} - -void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const { - SmallVector SSNs; - LLVMContext Ctx; - print(OS, MST, SSNs, Ctx, nullptr, nullptr); -} - void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index 533cce57adc844..60eeefba9d6fef 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -1303,6 +1303,12 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M, if (F.empty()) continue; + // Disable outlining from noreturn functions right now. Noreturn requires + // special handling for the case where what we are outlining could be a + // tail call. + if (F.hasFnAttribute(Attribute::NoReturn)) + continue; + // There's something in F. Check if it has a MachineFunction associated with // it. MachineFunction *MF = MMI.getMachineFunction(F); diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 9591211fd9ea4a..89c9f6093a975d 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -557,10 +557,7 @@ void SwingSchedulerDAG::schedule() { // The experimental code generator can't work if there are InstChanges. if (ExperimentalCodeGen && NewInstrChanges.empty()) { PeelingModuloScheduleExpander MSE(MF, MS, &LIS); - // Experimental code generation isn't complete yet, but it can partially - // validate the code it generates against the original - // ModuloScheduleExpander. 
- MSE.validateAgainstModuloScheduleExpander(); + MSE.expand(); } else { ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges)); MSE.expand(); diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index a68153cf3b6f29..7ce3c5861801a5 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopUtils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/MC/MCContext.h" @@ -1313,7 +1314,7 @@ void KernelRewriter::rewrite() { // Now remap every instruction in the loop. for (MachineInstr &MI : *BB) { - if (MI.isPHI()) + if (MI.isPHI() || MI.isTerminator()) continue; for (MachineOperand &MO : MI.uses()) { if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit()) @@ -1564,6 +1565,265 @@ class KernelOperandInfo { }; } // namespace +MachineBasicBlock * +PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) { + MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII); + if (LPD == LPD_Front) + PeeledFront.push_back(NewBB); + else + PeeledBack.push_front(NewBB); + for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator(); + ++I, ++NI) { + CanonicalMIs[&*I] = &*I; + CanonicalMIs[&*NI] = &*I; + BlockMIs[{NewBB, &*I}] = &*NI; + BlockMIs[{BB, &*I}] = &*I; + } + return NewBB; +} + +void PeelingModuloScheduleExpander::peelPrologAndEpilogs() { + BitVector LS(Schedule.getNumStages(), true); + BitVector AS(Schedule.getNumStages(), true); + LiveStages[BB] = LS; + AvailableStages[BB] = AS; + + // Peel out the prologs. + LS.reset(); + for (int I = 0; I < Schedule.getNumStages() - 1; ++I) { + LS[I] = 1; + Prologs.push_back(peelKernel(LPD_Front)); + LiveStages[Prologs.back()] = LS; + AvailableStages[Prologs.back()] = LS; + } + + // Create a block that will end up as the new loop exiting block (dominated by + // all prologs and epilogs). It will only contain PHIs, in the same order as + // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property + // that the exiting block is a (sub) clone of BB. This in turn gives us the + // property that any value deffed in BB but used outside of BB is used by a + // PHI in the exiting block. + MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock(); + + // Push out the epilogs, again in reverse order. + // We can't assume anything about the minumum loop trip count at this point, + // so emit a fairly complex epilog: + // K[0, 1, 2] // Kernel runs stages 0, 1, 2 + // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0]. + // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2. + // + // This creates a single-successor single-predecessor sequence of blocks for + // each epilog, which are kept this way for simplicity at this stage and + // cleaned up by the optimizer later. + for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) { + Epilogs.push_back(nullptr); + for (int J = Schedule.getNumStages() - 1; J >= I; --J) { + LS.reset(); + LS[J] = 1; + Epilogs.back() = peelKernel(LPD_Back); + LiveStages[Epilogs.back()] = LS; + AvailableStages[Epilogs.back()] = AS; + } + } + + // Now we've defined all the prolog and epilog blocks as a fallthrough + // sequence, add the edges that will be followed if the loop trip count is + // lower than the number of stages (connecting prologs directly with epilogs). 
+ auto PI = Prologs.begin(); + auto EI = Epilogs.begin(); + assert(Prologs.size() == Epilogs.size()); + for (; PI != Prologs.end(); ++PI, ++EI) { + MachineBasicBlock *Pred = *(*EI)->pred_begin(); + (*PI)->addSuccessor(*EI); + for (MachineInstr &MI : (*EI)->phis()) { + Register Reg = MI.getOperand(1).getReg(); + MachineInstr *Use = MRI.getUniqueVRegDef(Reg); + if (Use && Use->getParent() == Pred) + Reg = getEquivalentRegisterIn(Reg, *PI); + MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false)); + MI.addOperand(MachineOperand::CreateMBB(*PI)); + } + } + + // Create a list of all blocks in order. + SmallVector Blocks; + llvm::copy(PeeledFront, std::back_inserter(Blocks)); + Blocks.push_back(BB); + llvm::copy(PeeledBack, std::back_inserter(Blocks)); + + // Iterate in reverse order over all instructions, remapping as we go. + for (MachineBasicBlock *B : reverse(Blocks)) { + for (auto I = B->getFirstInstrTerminator()->getReverseIterator(); + I != std::next(B->getFirstNonPHI()->getReverseIterator());) { + MachineInstr *MI = &*I++; + rewriteUsesOf(MI); + } + } + // Now all remapping has been done, we're free to optimize the generated code. + for (MachineBasicBlock *B : reverse(Blocks)) + EliminateDeadPhis(B, MRI, LIS); + EliminateDeadPhis(ExitingBB, MRI, LIS); +} + +MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() { + MachineFunction &MF = *BB->getParent(); + MachineBasicBlock *Exit = *BB->succ_begin(); + if (Exit == BB) + Exit = *std::next(BB->succ_begin()); + + MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + MF.insert(std::next(BB->getIterator()), NewBB); + + // Clone all phis in BB into NewBB and rewrite. + for (MachineInstr &MI : BB->phis()) { + auto RC = MRI.getRegClass(MI.getOperand(0).getReg()); + Register OldR = MI.getOperand(3).getReg(); + Register R = MRI.createVirtualRegister(RC); + SmallVector Uses; + for (MachineInstr &Use : MRI.use_instructions(OldR)) + if (Use.getParent() != BB) + Uses.push_back(&Use); + for (MachineInstr *Use : Uses) + Use->substituteRegister(OldR, R, /*SubIdx=*/0, + *MRI.getTargetRegisterInfo()); + MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R) + .addReg(OldR) + .addMBB(BB); + BlockMIs[{NewBB, &MI}] = NI; + CanonicalMIs[NI] = &MI; + } + BB->replaceSuccessor(Exit, NewBB); + Exit->replacePhiUsesWith(BB, NewBB); + NewBB->addSuccessor(Exit); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond); + (void)CanAnalyzeBr; + assert(CanAnalyzeBr && "Must be able to analyze the loop branch!"); + TII->removeBranch(*BB); + TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB, + Cond, DebugLoc()); + TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc()); + return NewBB; +} + +Register +PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg, + MachineBasicBlock *BB) { + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg); + return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg(); +} + +void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) { + if (MI->isPHI()) { + // This is an illegal PHI. The loop-carried (desired) value is operand 3, + // and it is produced by this block. 
+ Register PhiR = MI->getOperand(0).getReg(); + Register R = MI->getOperand(3).getReg(); + int RMIStage = getStage(MRI.getUniqueVRegDef(R)); + if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage)) + R = MI->getOperand(1).getReg(); + MRI.setRegClass(R, MRI.getRegClass(PhiR)); + MRI.replaceRegWith(PhiR, R); + if (LIS) + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + return; + } + + int Stage = getStage(MI); + if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 || + LiveStages[MI->getParent()].test(Stage)) + // Instruction is live, no rewriting to do. + return; + + for (MachineOperand &DefMO : MI->defs()) { + SmallVector, 4> Subs; + for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) { + // Only PHIs can use values from this block by construction. + // Match with the equivalent PHI in B. + assert(UseMI.isPHI()); + Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(), + MI->getParent()); + Subs.emplace_back(&UseMI, Reg); + } + for (auto &Sub : Subs) + Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0, + *MRI.getTargetRegisterInfo()); + } + if (LIS) + LIS->RemoveMachineInstrFromMaps(*MI); + MI->eraseFromParent(); +} + +void PeelingModuloScheduleExpander::fixupBranches() { + std::unique_ptr Info = + TII->analyzeLoopForPipelining(BB); + assert(Info); + + // Work outwards from the kernel. + bool KernelDisposed = false; + int TC = Schedule.getNumStages() - 1; + for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend(); + ++PI, ++EI, --TC) { + MachineBasicBlock *Prolog = *PI; + MachineBasicBlock *Fallthrough = *Prolog->succ_begin(); + MachineBasicBlock *Epilog = *EI; + SmallVector Cond; + TII->removeBranch(*Prolog); + Optional StaticallyGreater = + Info->createTripCountGreaterCondition(TC, *Prolog, Cond); + if (!StaticallyGreater.hasValue()) { + LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n"); + // Dynamically branch based on Cond. + TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc()); + } else if (*StaticallyGreater == false) { + LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n"); + // Prolog never falls through; branch to epilog and orphan interior + // blocks. Leave it to unreachable-block-elim to clean up. + Prolog->removeSuccessor(Fallthrough); + for (MachineInstr &P : Fallthrough->phis()) { + P.RemoveOperand(2); + P.RemoveOperand(1); + } + TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc()); + KernelDisposed = true; + } else { + LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n"); + // Prolog always falls through; remove incoming values in epilog. 
+ Prolog->removeSuccessor(Epilog); + for (MachineInstr &P : Epilog->phis()) { + P.RemoveOperand(4); + P.RemoveOperand(3); + } + } + } + + if (!KernelDisposed) { + Info->adjustTripCount(-(Schedule.getNumStages() - 1)); + Info->setPreheader(Prologs.back()); + } else { + Info->disposed(); + } +} + +void PeelingModuloScheduleExpander::rewriteKernel() { + KernelRewriter KR(*Schedule.getLoop(), Schedule); + KR.rewrite(); +} + +void PeelingModuloScheduleExpander::expand() { + BB = Schedule.getLoop()->getTopBlock(); + Preheader = Schedule.getLoop()->getLoopPreheader(); + LLVM_DEBUG(Schedule.dump()); + + rewriteKernel(); + peelPrologAndEpilogs(); + fixupBranches(); +} + void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() { BB = Schedule.getLoop()->getTopBlock(); Preheader = Schedule.getLoop()->getLoopPreheader(); @@ -1593,6 +1853,7 @@ void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() { // Now run the new expansion algorithm. KernelRewriter KR(*Schedule.getLoop(), Schedule); KR.rewrite(); + peelPrologAndEpilogs(); // Collect all illegal phis that the new algorithm created. We'll give these // to KernelOperandInfo. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 93248cb078cd6b..7ea908437ff918 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -761,6 +761,11 @@ CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); } +bool TargetLowering::DAGCombinerInfo:: +recursivelyDeleteUnusedNodes(SDNode *N) { + return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N); +} + void TargetLowering::DAGCombinerInfo:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO); @@ -19911,8 +19916,13 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, // when the condition can be materialized as an all-ones register. Any // single bit-test can be materialized as an all-ones register with // shift-left and shift-right-arith. + // TODO: The operation legality checks could be loosened to include "custom", + // but that may cause regressions for targets that do not have shift + // instructions. 
if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND && - N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) { + N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2) && + TLI.isOperationLegal(ISD::SHL, VT) && + TLI.isOperationLegal(ISD::SRA, VT)) { SDValue AndLHS = N0->getOperand(0); auto *ConstAndRHS = dyn_cast(N0->getOperand(1)); if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) { diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 8904283ff8daef..0fd2bd7815ba14 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1303,6 +1303,7 @@ bool FastISel::selectCall(const User *I) { ExtraInfo |= InlineAsm::Extra_HasSideEffects; if (IA->isAlignStack()) ExtraInfo |= InlineAsm::Extra_IsAlignStack; + ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::INLINEASM)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 0efcaaa19cbde9..f40565c5fd12c0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1103,6 +1103,16 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { return; } break; + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + // These pseudo-ops are the same as the other STRICT_ ops except + // they are registered with setOperationAction() using the input type + // instead of the output type. + Action = TLI.getStrictFPOperationAction(Node->getOpcode(), + Node->getOperand(1).getValueType()); + break; case ISD::SADDSAT: case ISD::UADDSAT: case ISD::SSUBSAT: @@ -2141,6 +2151,9 @@ SDValue SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128) { + if (Node->isStrictFPOpcode()) + Node = DAG.mutateStrictFPToFP(Node); + RTLIB::Libcall LC; switch (Node->getOperand(0).getValueType().getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); @@ -2895,30 +2908,6 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { return true; } break; - case ISD::LROUND: - Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LROUND_F32, - RTLIB::LROUND_F64, RTLIB::LROUND_F80, - RTLIB::LROUND_F128, - RTLIB::LROUND_PPCF128)); - break; - case ISD::LLROUND: - Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32, - RTLIB::LLROUND_F64, RTLIB::LLROUND_F80, - RTLIB::LLROUND_F128, - RTLIB::LLROUND_PPCF128)); - break; - case ISD::LRINT: - Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LRINT_F32, - RTLIB::LRINT_F64, RTLIB::LRINT_F80, - RTLIB::LRINT_F128, - RTLIB::LRINT_PPCF128)); - break; - case ISD::LLRINT: - Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32, - RTLIB::LLRINT_F64, RTLIB::LLRINT_F80, - RTLIB::LLRINT_F128, - RTLIB::LLRINT_PPCF128)); - break; case ISD::VAARG: Results.push_back(DAG.expandVAArg(Node)); Results.push_back(Results[0].getValue(1)); @@ -3712,10 +3701,25 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // the "strict" properties. For now, we just fall back to the non-strict // version if that is legal on the target. The actual mutation of the // operation will happen in SelectionDAGISel::DoInstructionSelection. 
- if (TLI.getStrictFPOperationAction(Node->getOpcode(), - Node->getValueType(0)) - == TargetLowering::Legal) - return true; + switch (Node->getOpcode()) { + default: + if (TLI.getStrictFPOperationAction(Node->getOpcode(), + Node->getValueType(0)) + == TargetLowering::Legal) + return true; + break; + case ISD::STRICT_LRINT: + case ISD::STRICT_LLRINT: + case ISD::STRICT_LROUND: + case ISD::STRICT_LLROUND: + // These are registered by the operand type instead of the value + // type. Reflect that here. + if (TLI.getStrictFPOperationAction(Node->getOpcode(), + Node->getOperand(1).getValueType()) + == TargetLowering::Legal) + return true; + break; + } } // Replace the original node with the legalized result. @@ -3959,6 +3963,34 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128)); break; + case ISD::LROUND: + case ISD::STRICT_LROUND: + Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LROUND_F32, + RTLIB::LROUND_F64, RTLIB::LROUND_F80, + RTLIB::LROUND_F128, + RTLIB::LROUND_PPCF128)); + break; + case ISD::LLROUND: + case ISD::STRICT_LLROUND: + Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32, + RTLIB::LLROUND_F64, RTLIB::LLROUND_F80, + RTLIB::LLROUND_F128, + RTLIB::LLROUND_PPCF128)); + break; + case ISD::LRINT: + case ISD::STRICT_LRINT: + Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LRINT_F32, + RTLIB::LRINT_F64, RTLIB::LRINT_F80, + RTLIB::LRINT_F128, + RTLIB::LRINT_PPCF128)); + break; + case ISD::LLRINT: + case ISD::STRICT_LLRINT: + Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32, + RTLIB::LLRINT_F64, RTLIB::LLRINT_F80, + RTLIB::LLRINT_F128, + RTLIB::LLRINT_PPCF128)); + break; case ISD::FDIV: Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64, RTLIB::DIV_F80, RTLIB::DIV_F128, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index eadc388fc9d5c4..5562f400b6e1da 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -514,16 +514,25 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { if (Cond.getValueType().isVector()) { if (SDValue Res = WidenVSELECTAndMask(N)) std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl); - // It seems to improve code to generate two narrow SETCCs as opposed to - // splitting a wide result vector. - else if (Cond.getOpcode() == ISD::SETCC) - SplitVecRes_SETCC(Cond.getNode(), CL, CH); // Check if there are already splitted versions of the vector available and // use those instead of splitting the mask operand again. else if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Cond, CL, CH); - else + // It seems to improve code to generate two narrow SETCCs as opposed to + // splitting a wide result vector. + else if (Cond.getOpcode() == ISD::SETCC) { + // If the condition is a vXi1 vector, and the LHS of the setcc is a legal + // type and the setcc result type is the same vXi1, then leave the setcc + // alone. 
+ EVT CondLHSVT = Cond.getOperand(0).getValueType(); + if (Cond.getValueType().getVectorElementType() == MVT::i1 && + isTypeLegal(CondLHSVT) && + getSetCCResultType(CondLHSVT) == Cond.getValueType()) + std::tie(CL, CH) = DAG.SplitVector(Cond, dl); + else + SplitVecRes_SETCC(Cond.getNode(), CL, CH); + } else std::tie(CL, CH) = DAG.SplitVector(Cond, dl); } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 1598e4dfefd83d..ff806bdb822c27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1188,6 +1188,10 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { if (!Pred.isArtificial()) AddPredQueued(NewSU, Pred); + // Make sure the clone comes after the original. (InstrEmitter assumes + // this ordering.) + AddPredQueued(NewSU, SDep(SU, SDep::Artificial)); + // Only copy scheduled successors. Cut them from old node's successor // list and move them over. SmallVector, 4> DelDeps; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0c55ff73c3b740..52a71b91d93f69 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7756,12 +7756,16 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) { case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; break; case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; break; case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; break; + case ISD::STRICT_LRINT: NewOpc = ISD::LRINT; break; + case ISD::STRICT_LLRINT: NewOpc = ISD::LLRINT; break; case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; break; case ISD::STRICT_FNEARBYINT: NewOpc = ISD::FNEARBYINT; break; case ISD::STRICT_FMAXNUM: NewOpc = ISD::FMAXNUM; break; case ISD::STRICT_FMINNUM: NewOpc = ISD::FMINNUM; break; case ISD::STRICT_FCEIL: NewOpc = ISD::FCEIL; break; case ISD::STRICT_FFLOOR: NewOpc = ISD::FFLOOR; break; + case ISD::STRICT_LROUND: NewOpc = ISD::LROUND; break; + case ISD::STRICT_LLROUND: NewOpc = ISD::LLROUND; break; case ISD::STRICT_FROUND: NewOpc = ISD::FROUND; break; case ISD::STRICT_FTRUNC: NewOpc = ISD::FTRUNC; break; case ISD::STRICT_FP_ROUND: NewOpc = ISD::FP_ROUND; break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ff6358b442ad15..5380630eabf6fc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2622,17 +2622,11 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, // Subtract the minimum value. SDValue SwitchOp = getValue(B.SValue); EVT VT = SwitchOp.getValueType(); - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SwitchOp, - DAG.getConstant(B.First, dl, VT)); - - // Check range. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue RangeCmp = DAG.getSetCC( - dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), - Sub.getValueType()), - Sub, DAG.getConstant(B.Range, dl, VT), ISD::SETUGT); + SDValue RangeSub = + DAG.getNode(ISD::SUB, dl, VT, SwitchOp, DAG.getConstant(B.First, dl, VT)); // Determine the type of the test operands. 
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool UsePtrType = false; if (!TLI.isTypeLegal(VT)) { UsePtrType = true; @@ -2645,6 +2639,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, break; } } + SDValue Sub = RangeSub; if (UsePtrType) { VT = TLI.getPointerTy(DAG.getDataLayout()); Sub = DAG.getZExtOrTrunc(Sub, dl, VT); @@ -2656,20 +2651,29 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, MachineBasicBlock* MBB = B.Cases[0].ThisBB; - addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb); + if (!B.OmitRangeCheck) + addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb); addSuccessorWithProb(SwitchBB, MBB, B.Prob); SwitchBB->normalizeSuccProbs(); - SDValue BrRange = DAG.getNode(ISD::BRCOND, dl, - MVT::Other, CopyTo, RangeCmp, - DAG.getBasicBlock(B.Default)); + SDValue Root = CopyTo; + if (!B.OmitRangeCheck) { + // Conditional branch to the default block. + SDValue RangeCmp = DAG.getSetCC(dl, + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + RangeSub.getValueType()), + RangeSub, DAG.getConstant(B.Range, dl, RangeSub.getValueType()), + ISD::SETUGT); + + Root = DAG.getNode(ISD::BRCOND, dl, MVT::Other, Root, RangeCmp, + DAG.getBasicBlock(B.Default)); + } // Avoid emitting unnecessary branches to the next block. if (MBB != NextBlock(SwitchBB)) - BrRange = DAG.getNode(ISD::BR, dl, MVT::Other, BrRange, - DAG.getBasicBlock(MBB)); + Root = DAG.getNode(ISD::BR, dl, MVT::Other, Root, DAG.getBasicBlock(MBB)); - DAG.setRoot(BrRange); + DAG.setRoot(Root); } /// visitBitTestCase - this function produces one "bit test" @@ -4668,10 +4672,11 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { L = DAG.getPtrExtOrTrunc(L, dl, VT); setValue(&I, L); - if (!I.isUnordered()) { - SDValue OutChain = L.getValue(1); + SDValue OutChain = L.getValue(1); + if (!I.isUnordered()) DAG.setRoot(OutChain); - } + else + PendingLoads.push_back(OutChain); return; } @@ -6099,12 +6104,16 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_maxnum: case Intrinsic::experimental_constrained_minnum: case Intrinsic::experimental_constrained_ceil: case Intrinsic::experimental_constrained_floor: + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: case Intrinsic::experimental_constrained_round: case Intrinsic::experimental_constrained_trunc: visitConstrainedFPIntrinsic(cast(I)); @@ -6930,6 +6939,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( case Intrinsic::experimental_constrained_log2: Opcode = ISD::STRICT_FLOG2; break; + case Intrinsic::experimental_constrained_lrint: + Opcode = ISD::STRICT_LRINT; + break; + case Intrinsic::experimental_constrained_llrint: + Opcode = ISD::STRICT_LLRINT; + break; case Intrinsic::experimental_constrained_rint: Opcode = ISD::STRICT_FRINT; break; @@ -6948,6 +6963,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic( case Intrinsic::experimental_constrained_floor: Opcode = ISD::STRICT_FFLOOR; break; + case Intrinsic::experimental_constrained_lround: + Opcode = ISD::STRICT_LROUND; + break; + case Intrinsic::experimental_constrained_llround: + Opcode = 
ISD::STRICT_LLROUND; + break; case Intrinsic::experimental_constrained_round: Opcode = ISD::STRICT_FROUND; break; @@ -10164,8 +10185,6 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, break; } case CC_BitTests: { - // FIXME: If Fallthrough is unreachable, skip the range check. - // FIXME: Optimize away range check based on pivot comparisons. BitTestBlock *BTB = &SL->BitTestCases[I->BTCasesIndex]; @@ -10186,6 +10205,11 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond, BTB->DefaultProb -= DefaultProb / 2; } + if (FallthroughUnreachable) { + // Skip the range check if the fallthrough block is unreachable. + BTB->OmitRangeCheck = true; + } + // If we're in the right place, emit the bit test header right now. if (CurMBB == SwitchMBB) { visitBitTestHeader(*BTB, SwitchMBB); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 003dbb233b328c..462b719735d285 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -333,9 +333,13 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FP16_TO_FP: return "fp16_to_fp"; case ISD::FP_TO_FP16: return "fp_to_fp16"; case ISD::LROUND: return "lround"; + case ISD::STRICT_LROUND: return "strict_lround"; case ISD::LLROUND: return "llround"; + case ISD::STRICT_LLROUND: return "strict_llround"; case ISD::LRINT: return "lrint"; + case ISD::STRICT_LRINT: return "strict_lrint"; case ISD::LLRINT: return "llrint"; + case ISD::STRICT_LLRINT: return "strict_llrint"; // Control flow instructions case ISD::BR: return "br"; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index bc005a2cc27be7..1c11ca3286eb60 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -709,10 +709,14 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::STRICT_FLOG, VT, Expand); setOperationAction(ISD::STRICT_FLOG10, VT, Expand); setOperationAction(ISD::STRICT_FLOG2, VT, Expand); + setOperationAction(ISD::STRICT_LRINT, VT, Expand); + setOperationAction(ISD::STRICT_LLRINT, VT, Expand); setOperationAction(ISD::STRICT_FRINT, VT, Expand); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Expand); setOperationAction(ISD::STRICT_FCEIL, VT, Expand); setOperationAction(ISD::STRICT_FFLOOR, VT, Expand); + setOperationAction(ISD::STRICT_LROUND, VT, Expand); + setOperationAction(ISD::STRICT_LLROUND, VT, Expand); setOperationAction(ISD::STRICT_FROUND, VT, Expand); setOperationAction(ISD::STRICT_FTRUNC, VT, Expand); setOperationAction(ISD::STRICT_FMAXNUM, VT, Expand); diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp index 4be6e5a233bcac..8c58254d6e0aa5 100644 --- a/llvm/lib/Demangle/MicrosoftDemangle.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp @@ -2179,7 +2179,7 @@ NodeArrayNode *Demangler::demangleFunctionParameterList(StringView &MangledName, NodeArrayNode * Demangler::demangleTemplateParameterList(StringView &MangledName) { - NodeList *Head; + NodeList *Head = nullptr; NodeList **Current = &Head; size_t Count = 0; diff --git a/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h b/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h index 1271ad962b389b..b47a798c7603bb 100644 --- a/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h @@ -20,24 
+20,23 @@ namespace jitlink { template class BasicGOTAndStubsBuilder { public: - BasicGOTAndStubsBuilder(AtomGraph &G) : G(G) {} + BasicGOTAndStubsBuilder(LinkGraph &G) : G(G) {} void run() { - // We're going to be adding new atoms, but we don't want to iterate over - // the newly added ones, so just copy the existing atoms out. - std::vector DAs(G.defined_atoms().begin(), - G.defined_atoms().end()); + // We're going to be adding new blocks, but we don't want to iterate over + // the newly added ones, so just copy the existing blocks out. + std::vector Blocks(G.blocks().begin(), G.blocks().end()); - for (auto *DA : DAs) - for (auto &E : DA->edges()) + for (auto *B : Blocks) + for (auto &E : B->edges()) if (impl().isGOTEdge(E)) - impl().fixGOTEdge(E, getGOTEntryAtom(E.getTarget())); + impl().fixGOTEdge(E, getGOTEntrySymbol(E.getTarget())); else if (impl().isExternalBranchEdge(E)) - impl().fixExternalBranchEdge(E, getStubAtom(E.getTarget())); + impl().fixExternalBranchEdge(E, getStubSymbol(E.getTarget())); } protected: - Atom &getGOTEntryAtom(Atom &Target) { + Symbol &getGOTEntrySymbol(Symbol &Target) { assert(Target.hasName() && "GOT edge cannot point to anonymous target"); auto GOTEntryI = GOTEntries.find(Target.getName()); @@ -49,31 +48,31 @@ template class BasicGOTAndStubsBuilder { GOTEntries.insert(std::make_pair(Target.getName(), &GOTEntry)).first; } - assert(GOTEntryI != GOTEntries.end() && "Could not get GOT entry atom"); + assert(GOTEntryI != GOTEntries.end() && "Could not get GOT entry symbol"); return *GOTEntryI->second; } - Atom &getStubAtom(Atom &Target) { + Symbol &getStubSymbol(Symbol &Target) { assert(Target.hasName() && "External branch edge can not point to an anonymous target"); auto StubI = Stubs.find(Target.getName()); if (StubI == Stubs.end()) { - auto &StubAtom = impl().createStub(Target); - StubI = Stubs.insert(std::make_pair(Target.getName(), &StubAtom)).first; + auto &StubSymbol = impl().createStub(Target); + StubI = Stubs.insert(std::make_pair(Target.getName(), &StubSymbol)).first; } - assert(StubI != Stubs.end() && "Count not get stub atom"); + assert(StubI != Stubs.end() && "Count not get stub symbol"); return *StubI->second; } - AtomGraph &G; + LinkGraph &G; private: BuilderImpl &impl() { return static_cast(*this); } - DenseMap GOTEntries; - DenseMap Stubs; + DenseMap GOTEntries; + DenseMap Stubs; }; } // end namespace jitlink diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt index e81648311cf38e..ad3427fdfe316d 100644 --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -5,7 +5,7 @@ add_llvm_library(LLVMJITLink EHFrameSupport.cpp MachO.cpp MachO_x86_64.cpp - MachOAtomGraphBuilder.cpp + MachOLinkGraphBuilder.cpp DEPENDS intrinsics_gen diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index f373f2d92b0f09..f80b0e7f890958 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -17,18 +17,14 @@ namespace llvm { namespace jitlink { -EHFrameParser::EHFrameParser(AtomGraph &G, Section &EHFrameSection, - StringRef EHFrameContent, - JITTargetAddress EHFrameAddress, - Edge::Kind FDEToCIERelocKind, - Edge::Kind FDEToTargetRelocKind) - : G(G), EHFrameSection(EHFrameSection), EHFrameContent(EHFrameContent), - EHFrameAddress(EHFrameAddress), - EHFrameReader(EHFrameContent, G.getEndianness()), - 
FDEToCIERelocKind(FDEToCIERelocKind), - FDEToTargetRelocKind(FDEToTargetRelocKind) {} - -Error EHFrameParser::atomize() { +EHFrameBinaryParser::EHFrameBinaryParser(JITTargetAddress EHFrameAddress, + StringRef EHFrameContent, + unsigned PointerSize, + support::endianness Endianness) + : EHFrameAddress(EHFrameAddress), EHFrameContent(EHFrameContent), + PointerSize(PointerSize), EHFrameReader(EHFrameContent, Endianness) {} + +Error EHFrameBinaryParser::addToGraph() { while (!EHFrameReader.empty()) { size_t RecordOffset = EHFrameReader.getOffset(); @@ -38,44 +34,39 @@ Error EHFrameParser::atomize() { << " (offset " << RecordOffset << ")\n"; }); - size_t CIELength = 0; - uint32_t CIELengthField; - if (auto Err = EHFrameReader.readInteger(CIELengthField)) + size_t RecordLength = 0; + uint32_t RecordLengthField; + if (auto Err = EHFrameReader.readInteger(RecordLengthField)) return Err; - // Process CIE length/extended-length fields to build the atom. + // Process CIE/FDE length/extended-length fields to build the blocks. // // The value of these fields describe the length of the *rest* of the CIE // (not including data up to the end of the field itself) so we have to - // bump CIELength to include the data up to the end of the field: 4 bytes + // bump RecordLength to include the data up to the end of the field: 4 bytes // for Length, or 12 bytes (4 bytes + 8 bytes) for ExtendedLength. - if (CIELengthField == 0) // Length 0 means end of __eh_frame section. + if (RecordLengthField == 0) // Length 0 means end of __eh_frame section. break; // If the regular length field's value is 0xffffffff, use extended length. - if (CIELengthField == 0xffffffff) { - uint64_t CIEExtendedLengthField; - if (auto Err = EHFrameReader.readInteger(CIEExtendedLengthField)) + if (RecordLengthField == 0xffffffff) { + uint64_t ExtendedLengthField; + if (auto Err = EHFrameReader.readInteger(ExtendedLengthField)) return Err; - if (CIEExtendedLengthField > EHFrameReader.bytesRemaining()) + if (ExtendedLengthField > EHFrameReader.bytesRemaining()) return make_error("CIE record extends past the end of " "the __eh_frame section"); - if (CIEExtendedLengthField + 12 > std::numeric_limits::max()) + if (ExtendedLengthField + 12 > std::numeric_limits::max()) return make_error("CIE record too large to process"); - CIELength = CIEExtendedLengthField + 12; + RecordLength = ExtendedLengthField + 12; } else { - if (CIELengthField > EHFrameReader.bytesRemaining()) + if (RecordLengthField > EHFrameReader.bytesRemaining()) return make_error("CIE record extends past the end of " "the __eh_frame section"); - CIELength = CIELengthField + 4; + RecordLength = RecordLengthField + 4; } - LLVM_DEBUG(dbgs() << " length: " << CIELength << "\n"); - - // Add an atom for this record. - CurRecordAtom = &G.addAnonymousAtom( - EHFrameSection, EHFrameAddress + RecordOffset, G.getPointerSize()); - CurRecordAtom->setContent(EHFrameContent.substr(RecordOffset, CIELength)); + LLVM_DEBUG(dbgs() << " length: " << RecordLength << "\n"); // Read the CIE Pointer. size_t CIEPointerAddress = EHFrameAddress + EHFrameReader.getOffset(); @@ -85,21 +76,24 @@ Error EHFrameParser::atomize() { // Based on the CIE pointer value, parse this as a CIE or FDE record. 
if (CIEPointer == 0) { - if (auto Err = processCIE()) + if (auto Err = processCIE(RecordOffset, RecordLength)) return Err; } else { - if (auto Err = processFDE(CIEPointerAddress, CIEPointer)) + if (auto Err = processFDE(RecordOffset, RecordLength, CIEPointerAddress, + CIEPointer)) return Err; } - EHFrameReader.setOffset(RecordOffset + CIELength); + EHFrameReader.setOffset(RecordOffset + RecordLength); } return Error::success(); } -Expected -EHFrameParser::parseAugmentationString() { +void EHFrameBinaryParser::anchor() {} + +Expected +EHFrameBinaryParser::parseAugmentationString() { AugmentationInfo AugInfo; uint8_t NextChar; uint8_t *NextField = &AugInfo.Fields[0]; @@ -139,14 +133,14 @@ EHFrameParser::parseAugmentationString() { return std::move(AugInfo); } -Expected EHFrameParser::readAbsolutePointer() { +Expected EHFrameBinaryParser::readAbsolutePointer() { static_assert(sizeof(JITTargetAddress) == sizeof(uint64_t), "Result must be able to hold a uint64_t"); JITTargetAddress Addr; - if (G.getPointerSize() == 8) { + if (PointerSize == 8) { if (auto Err = EHFrameReader.readInteger(Addr)) return std::move(Err); - } else if (G.getPointerSize() == 4) { + } else if (PointerSize == 4) { uint32_t Addr32; if (auto Err = EHFrameReader.readInteger(Addr32)) return std::move(Err); @@ -156,14 +150,19 @@ Expected EHFrameParser::readAbsolutePointer() { return Addr; } -Error EHFrameParser::processCIE() { +Error EHFrameBinaryParser::processCIE(size_t RecordOffset, + size_t RecordLength) { // Use the dwarf namespace for convenient access to pointer encoding // constants. using namespace dwarf; LLVM_DEBUG(dbgs() << " Record is CIE\n"); - CIEInformation CIEInfo(*CurRecordAtom); + auto &CIESymbol = + createCIERecord(EHFrameAddress + RecordOffset, + EHFrameContent.substr(RecordOffset, RecordLength)); + + CIEInformation CIEInfo(CIESymbol); uint8_t Version = 0; if (auto Err = EHFrameReader.readInteger(Version)) @@ -179,7 +178,7 @@ Error EHFrameParser::processCIE() { // Skip the EH Data field if present. if (AugInfo->EHDataFieldPresent) - if (auto Err = EHFrameReader.skip(G.getPointerSize())) + if (auto Err = EHFrameReader.skip(PointerSize)) return Err; // Read and sanity check the code alignment factor. 
@@ -226,7 +225,7 @@ Error EHFrameParser::processCIE() { return make_error( "Unsupported LSDA pointer encoding " + formatv("{0:x2}", LSDAPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CurRecordAtom->getAddress())); + formatv("{0:x16}", CIESymbol.getAddress())); break; } case 'P': { @@ -239,7 +238,7 @@ Error EHFrameParser::processCIE() { "Unspported personality pointer " "encoding " + formatv("{0:x2}", PersonalityPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CurRecordAtom->getAddress())); + formatv("{0:x16}", CIESymbol.getAddress())); uint32_t PersonalityPointerAddress; if (auto Err = EHFrameReader.readInteger(PersonalityPointerAddress)) return Err; @@ -254,7 +253,7 @@ Error EHFrameParser::processCIE() { "Unsupported FDE address pointer " "encoding " + formatv("{0:x2}", FDEPointerEncoding) + " in CIE at " + - formatv("{0:x16}", CurRecordAtom->getAddress())); + formatv("{0:x16}", CIESymbol.getAddress())); break; } default: @@ -267,15 +266,16 @@ Error EHFrameParser::processCIE() { return make_error("Read past the end of the augmentation " "data while parsing fields"); - assert(!CIEInfos.count(CurRecordAtom->getAddress()) && + assert(!CIEInfos.count(CIESymbol.getAddress()) && "Multiple CIEs recorded at the same address?"); - CIEInfos[CurRecordAtom->getAddress()] = std::move(CIEInfo); + CIEInfos[CIESymbol.getAddress()] = std::move(CIEInfo); return Error::success(); } -Error EHFrameParser::processFDE(JITTargetAddress CIEPointerAddress, - uint32_t CIEPointer) { +Error EHFrameBinaryParser::processFDE(size_t RecordOffset, size_t RecordLength, + JITTargetAddress CIEPointerAddress, + uint32_t CIEPointer) { LLVM_DEBUG(dbgs() << " Record is FDE\n"); LLVM_DEBUG({ @@ -286,16 +286,11 @@ Error EHFrameParser::processFDE(JITTargetAddress CIEPointerAddress, auto CIEInfoItr = CIEInfos.find(CIEPointerAddress - CIEPointer); if (CIEInfoItr == CIEInfos.end()) return make_error( - "FDE at " + formatv("{0:x16}", CurRecordAtom->getAddress()) + + "FDE at " + formatv("{0:x16}", EHFrameAddress + RecordOffset) + " points to non-existant CIE at " + formatv("{0:x16}", CIEPointerAddress - CIEPointer)); auto &CIEInfo = CIEInfoItr->second; - // The CIEPointer looks good. Add a relocation. - CurRecordAtom->addEdge(FDEToCIERelocKind, - CIEPointerAddress - CurRecordAtom->getAddress(), - *CIEInfo.CIEAtom, 0); - // Read and sanity check the PC-start pointer and size. JITTargetAddress PCBeginAddress = EHFrameAddress + EHFrameReader.getOffset(); @@ -305,83 +300,68 @@ Error EHFrameParser::processFDE(JITTargetAddress CIEPointerAddress, JITTargetAddress PCBegin = PCBeginAddress + *PCBeginDelta; LLVM_DEBUG({ - dbgs() << " PC begin: " << format("0x%016" PRIx64, PCBegin) << "\n"; + dbgs() << " PC begin: " << format("0x%016" PRIx64, PCBegin) << "\n"; }); - auto *TargetAtom = G.getAtomByAddress(PCBegin); + auto *TargetSymbol = getSymbolAtAddress(PCBegin); - if (!TargetAtom) + if (!TargetSymbol) return make_error("FDE PC-begin " + formatv("{0:x16}", PCBegin) + - " does not point at atom"); + " does not point at symbol"); - if (TargetAtom->getAddress() != PCBegin) + if (TargetSymbol->getAddress() != PCBegin) return make_error( "FDE PC-begin " + formatv("{0:x16}", PCBegin) + - " does not point to start of atom at " + - formatv("{0:x16}", TargetAtom->getAddress())); - - LLVM_DEBUG(dbgs() << " FDE target: " << *TargetAtom << "\n"); + " does not point to start of symbol at " + + formatv("{0:x16}", TargetSymbol->getAddress())); - // The PC-start pointer and size look good. Add relocations. 
- CurRecordAtom->addEdge(FDEToTargetRelocKind, - PCBeginAddress - CurRecordAtom->getAddress(), - *TargetAtom, 0); - - // Add a keep-alive relocation from the function to the FDE to ensure it is - // not dead stripped. - TargetAtom->addEdge(Edge::KeepAlive, 0, *CurRecordAtom, 0); + LLVM_DEBUG(dbgs() << " FDE target: " << *TargetSymbol << "\n"); // Skip over the PC range size field. - if (auto Err = EHFrameReader.skip(G.getPointerSize())) + if (auto Err = EHFrameReader.skip(PointerSize)) return Err; + Symbol *LSDASymbol = nullptr; + JITTargetAddress LSDAAddress = 0; if (CIEInfo.FDEsHaveLSDAField) { uint64_t AugmentationDataSize; if (auto Err = EHFrameReader.readULEB128(AugmentationDataSize)) return Err; - if (AugmentationDataSize != G.getPointerSize()) + if (AugmentationDataSize != PointerSize) return make_error( "Unexpected FDE augmentation data size (expected " + - Twine(G.getPointerSize()) + ", got " + Twine(AugmentationDataSize) + - ") for FDE at " + formatv("{0:x16}", CurRecordAtom->getAddress())); - JITTargetAddress LSDAAddress = EHFrameAddress + EHFrameReader.getOffset(); + Twine(PointerSize) + ", got " + Twine(AugmentationDataSize) + + ") for FDE at " + formatv("{0:x16}", EHFrameAddress + RecordOffset)); + LSDAAddress = EHFrameAddress + EHFrameReader.getOffset(); auto LSDADelta = readAbsolutePointer(); if (!LSDADelta) return LSDADelta.takeError(); JITTargetAddress LSDA = LSDAAddress + *LSDADelta; - auto *LSDAAtom = G.getAtomByAddress(LSDA); + LSDASymbol = getSymbolAtAddress(LSDA); - if (!LSDAAtom) + if (!LSDASymbol) return make_error("FDE LSDA " + formatv("{0:x16}", LSDA) + - " does not point at atom"); + " does not point at symbol"); - if (LSDAAtom->getAddress() != LSDA) + if (LSDASymbol->getAddress() != LSDA) return make_error( "FDE LSDA " + formatv("{0:x16}", LSDA) + - " does not point to start of atom at " + - formatv("{0:x16}", LSDAAtom->getAddress())); - - LLVM_DEBUG(dbgs() << " FDE LSDA: " << *LSDAAtom << "\n"); + " does not point to start of symbol at " + + formatv("{0:x16}", LSDASymbol->getAddress())); - // LSDA looks good. Add relocations. - CurRecordAtom->addEdge(FDEToTargetRelocKind, - LSDAAddress - CurRecordAtom->getAddress(), *LSDAAtom, - 0); + LLVM_DEBUG(dbgs() << " FDE LSDA: " << *LSDASymbol << "\n"); } - return Error::success(); -} + JITTargetAddress RecordAddress = EHFrameAddress + RecordOffset; + auto FDESymbol = createFDERecord( + RecordAddress, EHFrameContent.substr(RecordOffset, RecordLength), + *CIEInfo.CIESymbol, CIEPointerAddress - RecordAddress, *TargetSymbol, + PCBeginAddress - RecordAddress, LSDASymbol, LSDAAddress - RecordAddress); -Error addEHFrame(AtomGraph &G, Section &EHFrameSection, - StringRef EHFrameContent, JITTargetAddress EHFrameAddress, - Edge::Kind FDEToCIERelocKind, - Edge::Kind FDEToTargetRelocKind) { - return EHFrameParser(G, EHFrameSection, EHFrameContent, EHFrameAddress, - FDEToCIERelocKind, FDEToTargetRelocKind) - .atomize(); + return FDESymbol.takeError(); } // Determine whether we can register EH tables. 
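For reference, the record-length handling in EHFrameBinaryParser::addToGraph above follows the standard eh-frame encoding: a Length field of 0 terminates the section, a value of 0xffffffff means an 8-byte ExtendedLength field follows, and the stored value covers only the rest of the record, hence the +4 / +12 adjustments. A minimal standalone sketch of the same computation (illustrative only, not part of the patch; the helper name is made up, reads are host-endian and unchecked):

    #include <cstdint>
    #include <cstring>
    #include <optional>

    // Total record size in bytes (length field(s) plus payload), or
    // std::nullopt for the zero-length terminator record.
    std::optional<uint64_t> ehFrameRecordTotalSize(const uint8_t *RecordStart) {
      uint32_t Length;
      std::memcpy(&Length, RecordStart, sizeof(Length));
      if (Length == 0)            // Length 0 marks the end of the eh-frame section.
        return std::nullopt;
      if (Length != 0xffffffffu)  // Regular record: 4-byte Length field + payload.
        return uint64_t(Length) + 4;
      uint64_t ExtendedLength;    // Extended record: 4 + 8 bytes of length fields.
      std::memcpy(&ExtendedLength, RecordStart + 4, sizeof(ExtendedLength));
      return ExtendedLength + 12;
    }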
@@ -523,7 +503,7 @@ InProcessEHFrameRegistrar &InProcessEHFrameRegistrar::getInstance() { InProcessEHFrameRegistrar::InProcessEHFrameRegistrar() {} -AtomGraphPassFunction +LinkGraphPassFunction createEHFrameRecorderPass(const Triple &TT, StoreFrameRangeFunction StoreRangeAddress) { const char *EHFrameSectionName = nullptr; @@ -533,14 +513,14 @@ createEHFrameRecorderPass(const Triple &TT, EHFrameSectionName = ".eh_frame"; auto RecordEHFrame = - [EHFrameSectionName, - StoreFrameRange = std::move(StoreRangeAddress)](AtomGraph &G) -> Error { - // Search for a non-empty eh-frame and record the address of the first atom - // in it. + [EHFrameSectionName, + StoreFrameRange = std::move(StoreRangeAddress)](LinkGraph &G) -> Error { + // Search for a non-empty eh-frame and record the address of the first + // symbol in it. JITTargetAddress Addr = 0; size_t Size = 0; if (auto *S = G.findSectionByName(EHFrameSectionName)) { - auto R = S->getRange(); + auto R = SectionRange(*S); Addr = R.getStart(); Size = R.getSize(); } diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h index d679edef7ea6f1..6f9f68ad8382c5 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h @@ -21,18 +21,31 @@ namespace llvm { namespace jitlink { -/// A generic parser for eh-frame sections. +/// A generic binary parser for eh-frame sections. /// -/// Adds atoms representing CIE and FDE entries, using the given FDE-to-CIE and -/// FDEToTarget relocation kinds. -class EHFrameParser { +/// Adds blocks and symbols representing CIE and FDE entries to a JITLink graph. +/// +/// This parser assumes that the user has already verified that the EH-frame's +/// address range does not overlap any other section/symbol, so that generated +/// CIE/FDE records do not overlap other sections/symbols. 
+class EHFrameBinaryParser { public: - EHFrameParser(AtomGraph &G, Section &EHFrameSection, StringRef EHFrameContent, - JITTargetAddress EHFrameAddress, Edge::Kind FDEToCIERelocKind, - Edge::Kind FDEToTargetRelocKind); - Error atomize(); + EHFrameBinaryParser(JITTargetAddress EHFrameAddress, StringRef EHFrameContent, + unsigned PointerSize, support::endianness Endianness); + virtual ~EHFrameBinaryParser() {} + + Error addToGraph(); private: + virtual void anchor(); + virtual Symbol *getSymbolAtAddress(JITTargetAddress Addr) = 0; + virtual Symbol &createCIERecord(JITTargetAddress RecordAddr, + StringRef RecordContent) = 0; + virtual Expected + createFDERecord(JITTargetAddress RecordAddr, StringRef RecordContent, + Symbol &CIE, size_t CIEOffset, Symbol &Func, + size_t FuncOffset, Symbol *LSDA, size_t LSDAOffset) = 0; + struct AugmentationInfo { bool AugmentationDataPresent = false; bool EHDataFieldPresent = false; @@ -41,31 +54,24 @@ class EHFrameParser { Expected parseAugmentationString(); Expected readAbsolutePointer(); - Error processCIE(); - Error processFDE(JITTargetAddress CIEPointerAddress, uint32_t CIEPointer); + Error processCIE(size_t RecordOffset, size_t RecordLength); + Error processFDE(size_t RecordOffset, size_t RecordLength, + JITTargetAddress CIEPointerOffset, uint32_t CIEPointer); struct CIEInformation { CIEInformation() = default; - CIEInformation(DefinedAtom &CIEAtom) : CIEAtom(&CIEAtom) {} - DefinedAtom *CIEAtom = nullptr; + CIEInformation(Symbol &CIESymbol) : CIESymbol(&CIESymbol) {} + Symbol *CIESymbol = nullptr; bool FDEsHaveLSDAField = false; }; - AtomGraph &G; - Section &EHFrameSection; - StringRef EHFrameContent; JITTargetAddress EHFrameAddress; + StringRef EHFrameContent; + unsigned PointerSize; BinaryStreamReader EHFrameReader; - DefinedAtom *CurRecordAtom = nullptr; DenseMap CIEInfos; - Edge::Kind FDEToCIERelocKind; - Edge::Kind FDEToTargetRelocKind; }; -Error addEHFrame(AtomGraph &G, Section &EHFrameSection, - StringRef EHFrameContent, JITTargetAddress EHFrameAddress, - Edge::Kind FDEToCIERelocKind, Edge::Kind FDEToTargetRelocKind); - } // end namespace jitlink } // end namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 9d0a7459dc0927..1e19038951ac24 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -56,95 +56,151 @@ std::error_code JITLinkError::convertToErrorCode() const { return std::error_code(GenericJITLinkError, *JITLinkerErrorCategory); } -const StringRef getGenericEdgeKindName(Edge::Kind K) { +const char *getGenericEdgeKindName(Edge::Kind K) { switch (K) { case Edge::Invalid: return "INVALID RELOCATION"; case Edge::KeepAlive: return "Keep-Alive"; - case Edge::LayoutNext: - return "Layout-Next"; default: llvm_unreachable("Unrecognized relocation kind"); } } -raw_ostream &operator<<(raw_ostream &OS, const Atom &A) { +const char *getLinkageName(Linkage L) { + switch (L) { + case Linkage::Strong: + return "strong"; + case Linkage::Weak: + return "weak"; + } + llvm_unreachable("Unrecognized llvm.jitlink.Linkage enum"); +} + +const char *getScopeName(Scope S) { + switch (S) { + case Scope::Default: + return "default"; + case Scope::Hidden: + return "hidden"; + case Scope::Local: + return "local"; + } + llvm_unreachable("Unrecognized llvm.jitlink.Scope enum"); +} + +raw_ostream &operator<<(raw_ostream &OS, const Block &B) { + return OS << formatv("{0:x16}", B.getAddress()) << " -- " + << formatv("{0:x16}", B.getAddress() + 
B.getSize()) << ": " + << (B.isZeroFill() ? "zero-fill" : "content") + << ", align = " << B.getAlignment() + << ", align-ofs = " << B.getAlignmentOffset() + << ", section = " << B.getSection().getName(); +} + +raw_ostream &operator<<(raw_ostream &OS, const Symbol &Sym) { OS << "<"; - if (A.getName().empty()) - OS << "anon@" << format("0x%016" PRIx64, A.getAddress()); + if (Sym.getName().empty()) + OS << "*anon*"; else - OS << A.getName(); - OS << " ["; - if (A.isDefined()) { - auto &DA = static_cast(A); - OS << " section=" << DA.getSection().getName(); - if (DA.isLive()) - OS << " live"; - if (DA.shouldDiscard()) - OS << " should-discard"; - } else - OS << " external"; - OS << " ]>"; + OS << Sym.getName(); + OS << ": flags = "; + switch (Sym.getLinkage()) { + case Linkage::Strong: + OS << 'S'; + break; + case Linkage::Weak: + OS << 'W'; + break; + } + switch (Sym.getScope()) { + case Scope::Default: + OS << 'D'; + break; + case Scope::Hidden: + OS << 'H'; + break; + case Scope::Local: + OS << 'L'; + break; + } + OS << (Sym.isLive() ? '+' : '-') + << ", size = " << formatv("{0:x8}", Sym.getSize()) + << ", addr = " << formatv("{0:x16}", Sym.getAddress()) << " (" + << formatv("{0:x16}", Sym.getAddressable().getAddress()) << " + " + << formatv("{0:x8}", Sym.getOffset()); + if (Sym.isDefined()) + OS << " " << Sym.getBlock().getSection().getName(); + OS << ")>"; return OS; } -void printEdge(raw_ostream &OS, const Atom &FixupAtom, const Edge &E, +void printEdge(raw_ostream &OS, const Block &B, const Edge &E, StringRef EdgeKindName) { - OS << "edge@" << formatv("{0:x16}", FixupAtom.getAddress() + E.getOffset()) - << ": " << FixupAtom << " + " << E.getOffset() << " -- " << EdgeKindName - << " -> " << E.getTarget() << " + " << E.getAddend(); + OS << "edge@" << formatv("{0:x16}", B.getAddress() + E.getOffset()) << ": " + << formatv("{0:x16}", B.getAddress()) << " + " << E.getOffset() << " -- " + << EdgeKindName << " -> " << E.getTarget() << " + " << E.getAddend(); } Section::~Section() { - for (auto *DA : DefinedAtoms) - DA->~DefinedAtom(); + for (auto *Sym : Symbols) + Sym->~Symbol(); } -void AtomGraph::dump(raw_ostream &OS, +LinkGraph::~LinkGraph() { + // Destroy blocks. + for (auto *B : Blocks) + B->~Block(); +} + +void LinkGraph::dump(raw_ostream &OS, std::function EdgeKindToName) { if (!EdgeKindToName) EdgeKindToName = [](Edge::Kind K) { return StringRef(); }; - OS << "Defined atoms:\n"; - for (auto *DA : defined_atoms()) { - OS << " " << format("0x%016" PRIx64, DA->getAddress()) << ": " << *DA + OS << "Symbols:\n"; + for (auto *Sym : defined_symbols()) { + OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym << "\n"; - for (auto &E : DA->edges()) { - OS << " "; - StringRef EdgeName = (E.getKind() < Edge::FirstRelocation - ? getGenericEdgeKindName(E.getKind()) - : EdgeKindToName(E.getKind())); - - if (!EdgeName.empty()) - printEdge(OS, *DA, E, EdgeName); - else { - auto EdgeNumberString = std::to_string(E.getKind()); - printEdge(OS, *DA, E, EdgeNumberString); + if (Sym->isDefined()) { + for (auto &E : Sym->getBlock().edges()) { + OS << " "; + StringRef EdgeName = (E.getKind() < Edge::FirstRelocation + ? 
getGenericEdgeKindName(E.getKind()) + : EdgeKindToName(E.getKind())); + + if (!EdgeName.empty()) + printEdge(OS, Sym->getBlock(), E, EdgeName); + else { + auto EdgeNumberString = std::to_string(E.getKind()); + printEdge(OS, Sym->getBlock(), E, EdgeNumberString); + } + OS << "\n"; } - OS << "\n"; } } - OS << "Absolute atoms:\n"; - for (auto *A : absolute_atoms()) - OS << " " << format("0x%016" PRIx64, A->getAddress()) << ": " << *A + OS << "Absolute symbols:\n"; + for (auto *Sym : absolute_symbols()) + OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym << "\n"; - OS << "External atoms:\n"; - for (auto *A : external_atoms()) - OS << " " << format("0x%016" PRIx64, A->getAddress()) << ": " << *A + OS << "External symbols:\n"; + for (auto *Sym : external_symbols()) + OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym << "\n"; } +void JITLinkAsyncLookupContinuation::anchor() {} + JITLinkContext::~JITLinkContext() {} bool JITLinkContext::shouldAddDefaultTargetPasses(const Triple &TT) const { return true; } -AtomGraphPassFunction JITLinkContext::getMarkLivePass(const Triple &TT) const { - return AtomGraphPassFunction(); +LinkGraphPassFunction JITLinkContext::getMarkLivePass(const Triple &TT) const { + return LinkGraphPassFunction(); } Error JITLinkContext::modifyPassConfig(const Triple &TT, @@ -152,9 +208,9 @@ Error JITLinkContext::modifyPassConfig(const Triple &TT, return Error::success(); } -Error markAllAtomsLive(AtomGraph &G) { - for (auto *DA : G.defined_atoms()) - DA->setLive(true); +Error markAllSymbolsLive(LinkGraph &G) { + for (auto *Sym : G.defined_symbols()) + Sym->setLive(true); return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 877107ffe25814..d4270b5aa79671 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "JITLinkGeneric.h" -#include "EHFrameSupportImpl.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/MemoryBuffer.h" @@ -25,7 +24,7 @@ JITLinkerBase::~JITLinkerBase() {} void JITLinkerBase::linkPhase1(std::unique_ptr Self) { - // Build the atom graph. + // Build the link graph. if (auto GraphOrErr = buildGraph(Ctx->getObjectBuffer())) G = std::move(*GraphOrErr); else @@ -33,33 +32,33 @@ void JITLinkerBase::linkPhase1(std::unique_ptr Self) { assert(G && "Graph should have been created by buildGraph above"); // Prune and optimize the graph. - if (auto Err = runPasses(Passes.PrePrunePasses, *G)) + if (auto Err = runPasses(Passes.PrePrunePasses)) return Ctx->notifyFailed(std::move(Err)); LLVM_DEBUG({ - dbgs() << "Atom graph \"" << G->getName() << "\" pre-pruning:\n"; + dbgs() << "Link graph \"" << G->getName() << "\" pre-pruning:\n"; dumpGraph(dbgs()); }); prune(*G); LLVM_DEBUG({ - dbgs() << "Atom graph \"" << G->getName() << "\" post-pruning:\n"; + dbgs() << "Link graph \"" << G->getName() << "\" post-pruning:\n"; dumpGraph(dbgs()); }); // Run post-pruning passes. - if (auto Err = runPasses(Passes.PostPrunePasses, *G)) + if (auto Err = runPasses(Passes.PostPrunePasses)) return Ctx->notifyFailed(std::move(Err)); - // Sort atoms into segments. - layOutAtoms(); + // Sort blocks into segments. + auto Layout = layOutBlocks(); // Allocate memory for segments. 
if (auto Err = allocateSegments(Layout)) return Ctx->notifyFailed(std::move(Err)); - // Notify client that the defined atoms have been assigned addresses. + // Notify client that the defined symbols have been assigned addresses. Ctx->notifyResolved(*G); auto ExternalSymbols = getExternalSymbolNames(); @@ -74,42 +73,42 @@ void JITLinkerBase::linkPhase1(std::unique_ptr Self) { // [Self=std::move(Self)](Expected Result) { // Self->linkPhase2(std::move(Self), std::move(Result)); // }); - // - // FIXME: Use move capture once we have c++14. auto *TmpCtx = Ctx.get(); - auto *UnownedSelf = Self.release(); - auto Phase2Continuation = - [UnownedSelf](Expected LookupResult) { - std::unique_ptr Self(UnownedSelf); - UnownedSelf->linkPhase2(std::move(Self), std::move(LookupResult)); - }; - TmpCtx->lookup(std::move(ExternalSymbols), std::move(Phase2Continuation)); + TmpCtx->lookup(std::move(ExternalSymbols), + createLookupContinuation( + [S = std::move(Self), L = std::move(Layout)]( + Expected LookupResult) mutable { + auto &TmpSelf = *S; + TmpSelf.linkPhase2(std::move(S), std::move(LookupResult), + std::move(L)); + })); } void JITLinkerBase::linkPhase2(std::unique_ptr Self, - Expected LR) { + Expected LR, + SegmentLayoutMap Layout) { // If the lookup failed, bail out. if (!LR) return deallocateAndBailOut(LR.takeError()); - // Assign addresses to external atoms. + // Assign addresses to external addressables. applyLookupResult(*LR); LLVM_DEBUG({ - dbgs() << "Atom graph \"" << G->getName() << "\" before copy-and-fixup:\n"; + dbgs() << "Link graph \"" << G->getName() << "\" before copy-and-fixup:\n"; dumpGraph(dbgs()); }); - // Copy atom content to working memory and fix up. - if (auto Err = copyAndFixUpAllAtoms(Layout, *Alloc)) + // Copy block content to working memory and fix up. + if (auto Err = copyAndFixUpBlocks(Layout, *Alloc)) return deallocateAndBailOut(std::move(Err)); LLVM_DEBUG({ - dbgs() << "Atom graph \"" << G->getName() << "\" after copy-and-fixup:\n"; + dbgs() << "Link graph \"" << G->getName() << "\" after copy-and-fixup:\n"; dumpGraph(dbgs()); }); - if (auto Err = runPasses(Passes.PostFixupPasses, *G)) + if (auto Err = runPasses(Passes.PostFixupPasses)) return deallocateAndBailOut(std::move(Err)); // FIXME: Use move capture once we have c++14. @@ -128,82 +127,38 @@ void JITLinkerBase::linkPhase3(std::unique_ptr Self, Error Err) { Ctx->notifyFinalized(std::move(Alloc)); } -Error JITLinkerBase::runPasses(AtomGraphPassList &Passes, AtomGraph &G) { +Error JITLinkerBase::runPasses(LinkGraphPassList &Passes) { for (auto &P : Passes) - if (auto Err = P(G)) + if (auto Err = P(*G)) return Err; return Error::success(); } -void JITLinkerBase::layOutAtoms() { - // Group sections by protections, and whether or not they're zero-fill. - for (auto &S : G->sections()) { +JITLinkerBase::SegmentLayoutMap JITLinkerBase::layOutBlocks() { - // Skip empty sections. - if (S.atoms_empty()) - continue; + SegmentLayoutMap Layout; - auto &SL = Layout[S.getProtectionFlags()]; - if (S.isZeroFill()) - SL.ZeroFillSections.push_back(SegmentLayout::SectionLayout(S)); + /// Partition blocks based on permissions and content vs. zero-fill. + for (auto *B : G->blocks()) { + auto &SegLists = Layout[B->getSection().getProtectionFlags()]; + if (!B->isZeroFill()) + SegLists.ContentBlocks.push_back(B); else - SL.ContentSections.push_back(SegmentLayout::SectionLayout(S)); + SegLists.ZeroFillBlocks.push_back(B); } - // Sort sections within the layout by ordinal. 
- { - auto CompareByOrdinal = [](const SegmentLayout::SectionLayout &LHS, - const SegmentLayout::SectionLayout &RHS) { - return LHS.S->getSectionOrdinal() < RHS.S->getSectionOrdinal(); + /// Sort blocks within each list. + for (auto &KV : Layout) { + + auto CompareBlocks = [](const Block *LHS, const Block *RHS) { + if (LHS->getSection().getOrdinal() != RHS->getSection().getOrdinal()) + return LHS->getSection().getOrdinal() < RHS->getSection().getOrdinal(); + return LHS->getOrdinal() < RHS->getOrdinal(); }; - for (auto &KV : Layout) { - auto &SL = KV.second; - std::sort(SL.ContentSections.begin(), SL.ContentSections.end(), - CompareByOrdinal); - std::sort(SL.ZeroFillSections.begin(), SL.ZeroFillSections.end(), - CompareByOrdinal); - } - } - // Add atoms to the sections. - for (auto &KV : Layout) { - auto &SL = KV.second; - for (auto *SIList : {&SL.ContentSections, &SL.ZeroFillSections}) { - for (auto &SI : *SIList) { - // First build the set of layout-heads (i.e. "heads" of layout-next - // chains) by copying the section atoms, then eliminating any that - // appear as layout-next targets. - DenseSet LayoutHeads; - for (auto *DA : SI.S->atoms()) - LayoutHeads.insert(DA); - - for (auto *DA : SI.S->atoms()) - if (DA->hasLayoutNext()) - LayoutHeads.erase(&DA->getLayoutNext()); - - // Next, sort the layout heads by address order. - std::vector OrderedLayoutHeads; - OrderedLayoutHeads.reserve(LayoutHeads.size()); - for (auto *DA : LayoutHeads) - OrderedLayoutHeads.push_back(DA); - - // Now sort the list of layout heads by address. - std::sort(OrderedLayoutHeads.begin(), OrderedLayoutHeads.end(), - [](const DefinedAtom *LHS, const DefinedAtom *RHS) { - return LHS->getAddress() < RHS->getAddress(); - }); - - // Now populate the SI.Atoms field by appending each of the chains. - for (auto *DA : OrderedLayoutHeads) { - SI.Atoms.push_back(DA); - while (DA->hasLayoutNext()) { - auto &Next = DA->getLayoutNext(); - SI.Atoms.push_back(&Next); - DA = &Next; - } - } - } - } + auto &SegLists = KV.second; + llvm::sort(SegLists.ContentBlocks, CompareBlocks); + llvm::sort(SegLists.ZeroFillBlocks, CompareBlocks); } LLVM_DEBUG({ @@ -213,18 +168,16 @@ void JITLinkerBase::layOutAtoms() { << static_cast(KV.first) << ":\n"; auto &SL = KV.second; for (auto &SIEntry : - {std::make_pair(&SL.ContentSections, "content sections"), - std::make_pair(&SL.ZeroFillSections, "zero-fill sections")}) { - auto &SIList = *SIEntry.first; + {std::make_pair(&SL.ContentBlocks, "content block"), + std::make_pair(&SL.ZeroFillBlocks, "zero-fill block")}) { dbgs() << " " << SIEntry.second << ":\n"; - for (auto &SI : SIList) { - dbgs() << " " << SI.S->getName() << ":\n"; - for (auto *DA : SI.Atoms) - dbgs() << " " << *DA << "\n"; - } + for (auto *B : *SIEntry.first) + dbgs() << " " << *B << "\n"; } } }); + + return Layout; } Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { @@ -234,61 +187,36 @@ Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { JITLinkMemoryManager::SegmentsRequestMap Segments; for (auto &KV : Layout) { auto &Prot = KV.first; - auto &SegLayout = KV.second; + auto &SegLists = KV.second; + + uint64_t SegAlign = 1; // Calculate segment content size. size_t SegContentSize = 0; - uint32_t SegContentAlign = 1; - for (auto &SI : SegLayout.ContentSections) { - assert(!SI.S->atoms_empty() && "Sections in layout must not be empty"); - assert(!SI.Atoms.empty() && "Section layouts must not be empty"); - - // Bump to section alignment before processing atoms. 
- SegContentSize = alignTo(SegContentSize, SI.S->getAlignment()); - SegContentAlign = std::max(SegContentAlign, SI.S->getAlignment()); - - for (auto *DA : SI.Atoms) { - SegContentSize = alignTo(SegContentSize, DA->getAlignment()); - SegContentSize += DA->getSize(); - SegContentAlign = std::max(SegContentAlign, DA->getAlignment()); - } + for (auto *B : SegLists.ContentBlocks) { + SegAlign = std::max(SegAlign, B->getAlignment()); + SegContentSize = alignToBlock(SegContentSize, *B); + SegContentSize += B->getSize(); } - // Calculate segment zero-fill size. - uint64_t SegZeroFillSize = 0; - uint32_t SegZeroFillAlign = 1; - - for (auto &SI : SegLayout.ZeroFillSections) { - assert(!SI.S->atoms_empty() && "Sections in layout must not be empty"); - assert(!SI.Atoms.empty() && "Section layouts must not be empty"); - - // Bump to section alignment before processing atoms. - SegZeroFillSize = alignTo(SegZeroFillSize, SI.S->getAlignment()); - SegZeroFillAlign = std::max(SegZeroFillAlign, SI.S->getAlignment()); + uint64_t SegZeroFillStart = SegContentSize; + uint64_t SegZeroFillEnd = SegZeroFillStart; - for (auto *DA : SI.Atoms) { - SegZeroFillSize = alignTo(SegZeroFillSize, DA->getAlignment()); - SegZeroFillSize += DA->getSize(); - SegZeroFillAlign = std::max(SegZeroFillAlign, SI.S->getAlignment()); - } + for (auto *B : SegLists.ZeroFillBlocks) { + SegAlign = std::max(SegAlign, B->getAlignment()); + SegZeroFillEnd = alignToBlock(SegZeroFillEnd, *B); + SegZeroFillEnd += B->getSize(); } - assert(isPowerOf2_32(SegContentAlign) && - "Expected content alignment to be power of 2"); - assert(isPowerOf2_32(SegZeroFillAlign) && - "Expected zero-fill alignment to be power of 2"); - // Round content alignment up to segment alignment. - SegContentAlign = std::max(SegContentAlign, SegZeroFillAlign); - - Segments[Prot] = {SegContentSize, SegContentAlign, SegZeroFillSize, - SegZeroFillAlign}; + Segments[Prot] = {SegAlign, SegContentSize, + SegZeroFillEnd - SegZeroFillStart}; LLVM_DEBUG({ dbgs() << (&KV == &*Layout.begin() ? "" : "; ") - << static_cast(Prot) << ": " - << SegContentSize << " content bytes (alignment " - << SegContentAlign << ") + " << SegZeroFillSize - << " zero-fill bytes (alignment " << SegZeroFillAlign << ")"; + << static_cast(Prot) + << ": alignment = " << SegAlign + << ", content size = " << SegContentSize + << ", zero-fill size = " << (SegZeroFillEnd - SegZeroFillStart); }); } LLVM_DEBUG(dbgs() << " }\n"); @@ -307,22 +235,19 @@ Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { } }); - // Update atom target addresses. + // Update block target addresses. for (auto &KV : Layout) { auto &Prot = KV.first; auto &SL = KV.second; - JITTargetAddress AtomTargetAddr = + JITTargetAddress NextBlockAddr = Alloc->getTargetMemory(static_cast(Prot)); - for (auto *SIList : {&SL.ContentSections, &SL.ZeroFillSections}) - for (auto &SI : *SIList) { - AtomTargetAddr = alignTo(AtomTargetAddr, SI.S->getAlignment()); - for (auto *DA : SI.Atoms) { - AtomTargetAddr = alignTo(AtomTargetAddr, DA->getAlignment()); - DA->setAddress(AtomTargetAddr); - AtomTargetAddr += DA->getSize(); - } + for (auto *SIList : {&SL.ContentBlocks, &SL.ZeroFillBlocks}) + for (auto *B : *SIList) { + NextBlockAddr = alignToBlock(NextBlockAddr, *B); + B->setAddress(NextBlockAddr); + NextBlockAddr += B->getSize(); } } @@ -330,34 +255,35 @@ Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { } DenseSet JITLinkerBase::getExternalSymbolNames() const { - // Identify unresolved external atoms. 
+ // Identify unresolved external symbols. DenseSet UnresolvedExternals; - for (auto *DA : G->external_atoms()) { - assert(DA->getAddress() == 0 && + for (auto *Sym : G->external_symbols()) { + assert(Sym->getAddress() == 0 && "External has already been assigned an address"); - assert(DA->getName() != StringRef() && DA->getName() != "" && + assert(Sym->getName() != StringRef() && Sym->getName() != "" && "Externals must be named"); - UnresolvedExternals.insert(DA->getName()); + UnresolvedExternals.insert(Sym->getName()); } return UnresolvedExternals; } void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) { - for (auto &KV : Result) { - Atom &A = G->getAtomByName(KV.first); - assert(A.getAddress() == 0 && "Atom already resolved"); - A.setAddress(KV.second.getAddress()); + for (auto *Sym : G->external_symbols()) { + assert(Sym->getAddress() == 0 && "Symbol already resolved"); + assert(!Sym->isDefined() && "Symbol being resolved is already defined"); + assert(Result.count(Sym->getName()) && "Missing resolution for symbol"); + Sym->getAddressable().setAddress(Result[Sym->getName()].getAddress()); } LLVM_DEBUG({ dbgs() << "Externals after applying lookup result:\n"; - for (auto *A : G->external_atoms()) - dbgs() << " " << A->getName() << ": " - << formatv("{0:x16}", A->getAddress()) << "\n"; + for (auto *Sym : G->external_symbols()) + dbgs() << " " << Sym->getName() << ": " + << formatv("{0:x16}", Sym->getAddress()) << "\n"; }); - assert(llvm::all_of(G->external_atoms(), - [](Atom *A) { return A->getAddress() != 0; }) && - "All atoms should have been resolved by this point"); + assert(llvm::all_of(G->external_symbols(), + [](Symbol *Sym) { return Sym->getAddress() != 0; }) && + "All symbols should have been resolved by this point"); } void JITLinkerBase::deallocateAndBailOut(Error Err) { @@ -371,96 +297,60 @@ void JITLinkerBase::dumpGraph(raw_ostream &OS) { G->dump(dbgs(), [this](Edge::Kind K) { return getEdgeKindName(K); }); } -void prune(AtomGraph &G) { - std::vector Worklist; - DenseMap> EdgesToUpdate; +void prune(LinkGraph &G) { + std::vector Worklist; + DenseSet VisitedBlocks; - // Build the initial worklist from all atoms initially live. - for (auto *DA : G.defined_atoms()) { - if (!DA->isLive() || DA->shouldDiscard()) - continue; - - for (auto &E : DA->edges()) { - if (!E.getTarget().isDefined()) - continue; + // Build the initial worklist from all symbols initially live. + for (auto *Sym : G.defined_symbols()) + if (Sym->isLive()) + Worklist.push_back(Sym); - auto &EDT = static_cast(E.getTarget()); - - if (EDT.shouldDiscard()) - EdgesToUpdate[&EDT].push_back(&E); - else if (E.isKeepAlive() && !EDT.isLive()) - Worklist.push_back(&EDT); - } - } - - // Propagate live flags to all atoms reachable from the initial live set. + // Propagate live flags to all symbols reachable from the initial live set. while (!Worklist.empty()) { - DefinedAtom &NextLive = *Worklist.back(); + auto *Sym = Worklist.back(); Worklist.pop_back(); - assert(!NextLive.shouldDiscard() && - "should-discard nodes should never make it into the worklist"); + auto &B = Sym->getBlock(); - // If this atom has already been marked as live, or is marked to be - // discarded, then skip it. - if (NextLive.isLive()) + // Skip addressables that we've visited before. + if (VisitedBlocks.count(&B)) continue; - // Otherwise set it as live and add any non-live atoms that it points to - // to the worklist. 
- NextLive.setLive(true); - - for (auto &E : NextLive.edges()) { - if (!E.getTarget().isDefined()) - continue; - - auto &EDT = static_cast(E.getTarget()); + VisitedBlocks.insert(&B); - if (EDT.shouldDiscard()) - EdgesToUpdate[&EDT].push_back(&E); - else if (E.isKeepAlive() && !EDT.isLive()) - Worklist.push_back(&EDT); + for (auto &E : Sym->getBlock().edges()) { + if (E.getTarget().isDefined() && !E.getTarget().isLive()) { + E.getTarget().setLive(true); + Worklist.push_back(&E.getTarget()); + } } } - // Collect atoms to remove, then remove them from the graph. - std::vector AtomsToRemove; - for (auto *DA : G.defined_atoms()) - if (DA->shouldDiscard() || !DA->isLive()) - AtomsToRemove.push_back(DA); - - LLVM_DEBUG(dbgs() << "Pruning atoms:\n"); - for (auto *DA : AtomsToRemove) { - LLVM_DEBUG(dbgs() << " " << *DA << "... "); - - // Check whether we need to replace this atom with an external atom. - // - // We replace if all of the following hold: - // (1) The atom is marked should-discard, - // (2) it has live edges (i.e. edges from live atoms) pointing to it. - // - // Otherwise we simply delete the atom. - - G.removeDefinedAtom(*DA); - - auto EdgesToUpdateItr = EdgesToUpdate.find(DA); - if (EdgesToUpdateItr != EdgesToUpdate.end()) { - auto &ExternalReplacement = G.addExternalAtom(DA->getName()); - for (auto *EdgeToUpdate : EdgesToUpdateItr->second) - EdgeToUpdate->setTarget(ExternalReplacement); - LLVM_DEBUG(dbgs() << "replaced with " << ExternalReplacement << "\n"); - } else - LLVM_DEBUG(dbgs() << "deleted\n"); + // Collect all the symbols to remove, then remove them. + { + LLVM_DEBUG(dbgs() << "Dead-stripping symbols:\n"); + std::vector SymbolsToRemove; + for (auto *Sym : G.defined_symbols()) + if (!Sym->isLive()) + SymbolsToRemove.push_back(Sym); + for (auto *Sym : SymbolsToRemove) { + LLVM_DEBUG(dbgs() << " " << *Sym << "...\n"); + G.removeDefinedSymbol(*Sym); + } } - // Finally, discard any absolute symbols that were marked should-discard. + // Delete any unused blocks. 
   {
-    std::vector<Atom *> AbsoluteAtomsToRemove;
-    for (auto *A : G.absolute_atoms())
-      if (A->shouldDiscard() || A->isLive())
-        AbsoluteAtomsToRemove.push_back(A);
-    for (auto *A : AbsoluteAtomsToRemove)
-      G.removeAbsoluteAtom(*A);
+    LLVM_DEBUG(dbgs() << "Dead-stripping blocks:\n");
+    std::vector<Block *> BlocksToRemove;
+    for (auto *B : G.blocks())
+      if (!VisitedBlocks.count(B))
+        BlocksToRemove.push_back(B);
+    for (auto *B : BlocksToRemove) {
+      LLVM_DEBUG(dbgs() << "  " << *B << "...\n");
+      G.removeBlock(*B);
+    }
   }
 }
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
index eeb2527bd1b778..07dee6cee20027 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
@@ -41,39 +41,32 @@ class JITLinkerBase {
 protected:
 
   struct SegmentLayout {
-    using SectionAtomsList = std::vector<DefinedAtom *>;
-    struct SectionLayout {
-      SectionLayout(Section &S) : S(&S) {}
+    using BlocksList = std::vector<Block *>;
 
-      Section *S;
-      SectionAtomsList Atoms;
-    };
-
-    using SectionLayoutList = std::vector<SectionLayout>;
-
-    SectionLayoutList ContentSections;
-    SectionLayoutList ZeroFillSections;
+    BlocksList ContentBlocks;
+    BlocksList ZeroFillBlocks;
   };
 
   using SegmentLayoutMap = DenseMap<unsigned, SegmentLayout>;
 
   // Phase 1:
-  //   1.1: Build atom graph
+  //   1.1: Build link graph
   //   1.2: Run pre-prune passes
   //   1.2: Prune graph
   //   1.3: Run post-prune passes
-  //   1.4: Sort atoms into segments
+  //   1.4: Sort blocks into segments
   //   1.5: Allocate segment memory
   //   1.6: Identify externals and make an async call to resolve function
   void linkPhase1(std::unique_ptr<JITLinkerBase> Self);
 
   // Phase 2:
   //   2.1: Apply resolution results
-  //   2.2: Fix up atom contents
+  //   2.2: Fix up block contents
   //   2.3: Call OnResolved callback
   //   2.3: Make an async call to transfer and finalize memory.
   void linkPhase2(std::unique_ptr<JITLinkerBase> Self,
-                  Expected<AsyncLookupResult> LookupResult);
+                  Expected<AsyncLookupResult> LookupResult,
+                  SegmentLayoutMap Layout);
 
   // Phase 3:
   //   3.1: Call OnFinalized callback, handing off allocation.
@@ -81,24 +74,37 @@ class JITLinkerBase {
 
   // Build a graph from the given object buffer.
   // To be implemented by the client.
-  virtual Expected<std::unique_ptr<AtomGraph>>
+  virtual Expected<std::unique_ptr<LinkGraph>>
   buildGraph(MemoryBufferRef ObjBuffer) = 0;
 
-  // For debug dumping of the atom graph.
+  // For debug dumping of the link graph.
   virtual StringRef getEdgeKindName(Edge::Kind K) const = 0;
 
+  // Align a JITTargetAddress to conform with block alignment requirements.
+  static JITTargetAddress alignToBlock(JITTargetAddress Addr, Block &B) {
+    uint64_t Delta = (B.getAlignmentOffset() - Addr) % B.getAlignment();
+    return Addr + Delta;
+  }
+
+  // Align a pointer to conform with block alignment requirements.
+  static char *alignToBlock(char *P, Block &B) {
+    uint64_t PAddr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(P));
+    uint64_t Delta = (B.getAlignmentOffset() - PAddr) % B.getAlignment();
+    return P + Delta;
+  }
+
 private:
 
   // Run all passes in the given pass list, bailing out immediately if any pass
   // returns an error.
-  Error runPasses(AtomGraphPassList &Passes, AtomGraph &G);
+  Error runPasses(LinkGraphPassList &Passes);
 
-  // Copy atom contents and apply relocations.
+  // Copy block contents and apply relocations.
   // Implemented in JITLinker.
virtual Error - copyAndFixUpAllAtoms(const SegmentLayoutMap &Layout, - JITLinkMemoryManager::Allocation &Alloc) const = 0; + copyAndFixUpBlocks(const SegmentLayoutMap &Layout, + JITLinkMemoryManager::Allocation &Alloc) const = 0; - void layOutAtoms(); + SegmentLayoutMap layOutBlocks(); Error allocateSegments(const SegmentLayoutMap &Layout); DenseSet getExternalSymbolNames() const; void applyLookupResult(AsyncLookupResult LR); @@ -108,8 +114,7 @@ class JITLinkerBase { std::unique_ptr Ctx; PassConfiguration Passes; - std::unique_ptr G; - SegmentLayoutMap Layout; + std::unique_ptr G; std::unique_ptr Alloc; }; @@ -140,17 +145,17 @@ template class JITLinker : public JITLinkerBase { } Error - copyAndFixUpAllAtoms(const SegmentLayoutMap &Layout, - JITLinkMemoryManager::Allocation &Alloc) const override { - LLVM_DEBUG(dbgs() << "Copying and fixing up atoms:\n"); + copyAndFixUpBlocks(const SegmentLayoutMap &Layout, + JITLinkMemoryManager::Allocation &Alloc) const override { + LLVM_DEBUG(dbgs() << "Copying and fixing up blocks:\n"); for (auto &KV : Layout) { auto &Prot = KV.first; auto &SegLayout = KV.second; auto SegMem = Alloc.getWorkingMemory( static_cast(Prot)); - char *LastAtomEnd = SegMem.data(); - char *AtomDataPtr = LastAtomEnd; + char *LastBlockEnd = SegMem.data(); + char *BlockDataPtr = LastBlockEnd; LLVM_DEBUG({ dbgs() << " Processing segment " @@ -160,93 +165,79 @@ template class JITLinker : public JITLinkerBase { << " ]\n Processing content sections:\n"; }); - for (auto &SI : SegLayout.ContentSections) { - LLVM_DEBUG(dbgs() << " " << SI.S->getName() << ":\n"); + for (auto *B : SegLayout.ContentBlocks) { + LLVM_DEBUG(dbgs() << " " << *B << ":\n"); + + // Pad to alignment/alignment-offset. + BlockDataPtr = alignToBlock(BlockDataPtr, *B); - AtomDataPtr += alignmentAdjustment(AtomDataPtr, SI.S->getAlignment()); + LLVM_DEBUG({ + dbgs() << " Bumped block pointer to " + << (const void *)BlockDataPtr << " to meet block alignment " + << B->getAlignment() << " and alignment offset " + << B->getAlignmentOffset() << "\n"; + }); + // Zero pad up to alignment. LLVM_DEBUG({ - dbgs() << " Bumped atom pointer to " << (const void *)AtomDataPtr - << " to meet section alignment " - << " of " << SI.S->getAlignment() << "\n"; + if (LastBlockEnd != BlockDataPtr) + dbgs() << " Zero padding from " << (const void *)LastBlockEnd + << " to " << (const void *)BlockDataPtr << "\n"; }); - for (auto *DA : SI.Atoms) { - - // Align. - AtomDataPtr += alignmentAdjustment(AtomDataPtr, DA->getAlignment()); - LLVM_DEBUG({ - dbgs() << " Bumped atom pointer to " - << (const void *)AtomDataPtr << " to meet alignment of " - << DA->getAlignment() << "\n"; - }); - - // Zero pad up to alignment. - LLVM_DEBUG({ - if (LastAtomEnd != AtomDataPtr) - dbgs() << " Zero padding from " << (const void *)LastAtomEnd - << " to " << (const void *)AtomDataPtr << "\n"; - }); - while (LastAtomEnd != AtomDataPtr) - *LastAtomEnd++ = 0; - - // Copy initial atom content. - LLVM_DEBUG({ - dbgs() << " Copying atom " << *DA << " content, " - << DA->getContent().size() << " bytes, from " - << (const void *)DA->getContent().data() << " to " - << (const void *)AtomDataPtr << "\n"; - }); - memcpy(AtomDataPtr, DA->getContent().data(), DA->getContent().size()); - - // Copy atom data and apply fixups. - LLVM_DEBUG(dbgs() << " Applying fixups.\n"); - for (auto &E : DA->edges()) { - - // Skip non-relocation edges. - if (!E.isRelocation()) - continue; - - // Dispatch to LinkerImpl for fixup. 
- if (auto Err = impl().applyFixup(*DA, E, AtomDataPtr)) - return Err; - } - - // Point the atom's content to the fixed up buffer. - DA->setContent(StringRef(AtomDataPtr, DA->getContent().size())); - - // Update atom end pointer. - LastAtomEnd = AtomDataPtr + DA->getContent().size(); - AtomDataPtr = LastAtomEnd; + while (LastBlockEnd != BlockDataPtr) + *LastBlockEnd++ = 0; + + // Copy initial block content. + LLVM_DEBUG({ + dbgs() << " Copying block " << *B << " content, " + << B->getContent().size() << " bytes, from " + << (const void *)B->getContent().data() << " to " + << (const void *)BlockDataPtr << "\n"; + }); + memcpy(BlockDataPtr, B->getContent().data(), B->getContent().size()); + + // Copy Block data and apply fixups. + LLVM_DEBUG(dbgs() << " Applying fixups.\n"); + for (auto &E : B->edges()) { + + // Skip non-relocation edges. + if (!E.isRelocation()) + continue; + + // Dispatch to LinkerImpl for fixup. + if (auto Err = impl().applyFixup(*B, E, BlockDataPtr)) + return Err; } + + // Point the block's content to the fixed up buffer. + B->setContent(StringRef(BlockDataPtr, B->getContent().size())); + + // Update block end pointer. + LastBlockEnd = BlockDataPtr + B->getContent().size(); + BlockDataPtr = LastBlockEnd; } // Zero pad the rest of the segment. LLVM_DEBUG({ dbgs() << " Zero padding end of segment from " - << (const void *)LastAtomEnd << " to " + << (const void *)LastBlockEnd << " to " << (const void *)((char *)SegMem.data() + SegMem.size()) << "\n"; }); - while (LastAtomEnd != SegMem.data() + SegMem.size()) - *LastAtomEnd++ = 0; + while (LastBlockEnd != SegMem.data() + SegMem.size()) + *LastBlockEnd++ = 0; } return Error::success(); } }; -/// Dead strips and replaces discarded definitions with external atoms. +/// Removes dead symbols/blocks/addressables. /// -/// Finds the set of nodes reachable from any node initially marked live -/// (nodes marked should-discard are treated as not live, even if they are -/// reachable). All nodes not marked as live at the end of this process, -/// are deleted. Nodes that are live, but marked should-discard are replaced -/// with external atoms and all edges to them are re-written. -void prune(AtomGraph &G); - -Error addEHFrame(AtomGraph &G, Section &EHFrameSection, - StringRef EHFrameContent, JITTargetAddress EHFrameAddress, - Edge::Kind FDEToCIERelocKind, Edge::Kind FDEToTargetRelocKind); +/// Finds the set of symbols and addressables reachable from any symbol +/// initially marked live. All symbols/addressables not marked live at the end +/// of this process are removed. 
+void prune(LinkGraph &G); } // end namespace jitlink } // end namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index 267307cfde05cb..ecc6793bbce946 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -61,6 +61,10 @@ InProcessMemoryManager::allocate(const SegmentsRequestMap &Request) { AllocationMap SegBlocks; }; + if (!isPowerOf2_64((uint64_t)sys::Process::getPageSizeEstimate())) + return make_error("Page size is not a power of 2", + inconvertibleErrorCode()); + AllocationMap Blocks; const sys::Memory::ProtectionFlags ReadWrite = static_cast(sys::Memory::MF_READ | @@ -69,19 +73,12 @@ InProcessMemoryManager::allocate(const SegmentsRequestMap &Request) { for (auto &KV : Request) { auto &Seg = KV.second; - if (Seg.getContentAlignment() > sys::Process::getPageSizeEstimate()) + if (Seg.getAlignment() > sys::Process::getPageSizeEstimate()) return make_error("Cannot request higher than page " "alignment", inconvertibleErrorCode()); - if (sys::Process::getPageSizeEstimate() % Seg.getContentAlignment() != 0) - return make_error("Page size is not a multiple of " - "alignment", - inconvertibleErrorCode()); - - uint64_t ZeroFillStart = - alignTo(Seg.getContentSize(), Seg.getZeroFillAlignment()); - uint64_t SegmentSize = ZeroFillStart + Seg.getZeroFillSize(); + uint64_t SegmentSize = Seg.getContentSize() + Seg.getZeroFillSize(); std::error_code EC; auto SegMem = @@ -91,7 +88,7 @@ InProcessMemoryManager::allocate(const SegmentsRequestMap &Request) { return errorCodeToError(EC); // Zero out the zero-fill memory. - memset(static_cast(SegMem.base()) + ZeroFillStart, 0, + memset(static_cast(SegMem.base()) + Seg.getContentSize(), 0, Seg.getZeroFillSize()); // Record the block for this segment. diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.cpp deleted file mode 100644 index c1040c942b27f3..00000000000000 --- a/llvm/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.cpp +++ /dev/null @@ -1,412 +0,0 @@ -//=--------- MachOAtomGraphBuilder.cpp - MachO AtomGraph builder ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Generic MachO AtomGraph buliding code. 
-// -//===----------------------------------------------------------------------===// - -#include "MachOAtomGraphBuilder.h" - -#define DEBUG_TYPE "jitlink" - -namespace llvm { -namespace jitlink { - -MachOAtomGraphBuilder::~MachOAtomGraphBuilder() {} - -Expected> MachOAtomGraphBuilder::buildGraph() { - if (auto Err = parseSections()) - return std::move(Err); - - if (auto Err = addAtoms()) - return std::move(Err); - - if (auto Err = addRelocations()) - return std::move(Err); - - return std::move(G); -} - -MachOAtomGraphBuilder::MachOAtomGraphBuilder(const object::MachOObjectFile &Obj) - : Obj(Obj), - G(std::make_unique(Obj.getFileName(), getPointerSize(Obj), - getEndianness(Obj))) {} - -void MachOAtomGraphBuilder::addCustomAtomizer(StringRef SectionName, - CustomAtomizeFunction Atomizer) { - assert(!CustomAtomizeFunctions.count(SectionName) && - "Custom atomizer for this section already exists"); - CustomAtomizeFunctions[SectionName] = std::move(Atomizer); -} - -bool MachOAtomGraphBuilder::areLayoutLocked(const Atom &A, const Atom &B) { - // If these atoms are the same then they're trivially "locked". - if (&A == &B) - return true; - - // If A and B are different, check whether either is undefined. (in which - // case they are not locked). - if (!A.isDefined() || !B.isDefined()) - return false; - - // A and B are different, but they're both defined atoms. We need to check - // whether they're part of the same alt_entry chain. - auto &DA = static_cast(A); - auto &DB = static_cast(B); - - auto AStartItr = AltEntryStarts.find(&DA); - if (AStartItr == AltEntryStarts.end()) // If A is not in a chain bail out. - return false; - - auto BStartItr = AltEntryStarts.find(&DB); - if (BStartItr == AltEntryStarts.end()) // If B is not in a chain bail out. - return false; - - // A and B are layout locked if they're in the same chain. - return AStartItr->second == BStartItr->second; -} - -unsigned -MachOAtomGraphBuilder::getPointerSize(const object::MachOObjectFile &Obj) { - return Obj.is64Bit() ? 8 : 4; -} - -support::endianness -MachOAtomGraphBuilder::getEndianness(const object::MachOObjectFile &Obj) { - return Obj.isLittleEndian() ? support::little : support::big; -} - -MachOAtomGraphBuilder::MachOSection &MachOAtomGraphBuilder::getCommonSection() { - if (!CommonSymbolsSection) { - auto Prot = static_cast( - sys::Memory::MF_READ | sys::Memory::MF_WRITE); - auto &GenericSection = G->createSection("", 1, Prot, true); - CommonSymbolsSection = MachOSection(GenericSection); - } - return *CommonSymbolsSection; -} - -Error MachOAtomGraphBuilder::parseSections() { - for (auto &SecRef : Obj.sections()) { - assert((SecRef.getAlignment() <= std::numeric_limits::max()) && - "Section alignment does not fit in 32 bits"); - - Expected NameOrErr = SecRef.getName(); - if (!NameOrErr) - return NameOrErr.takeError(); - StringRef Name = *NameOrErr; - - unsigned SectionIndex = SecRef.getIndex() + 1; - - uint32_t Align = SecRef.getAlignment(); - if (!isPowerOf2_32(Align)) - return make_error("Section " + Name + - " has non-power-of-2 " - "alignment"); - - // FIXME: Get real section permissions - // How, exactly, on MachO? 
- sys::Memory::ProtectionFlags Prot; - if (SecRef.isText()) - Prot = static_cast(sys::Memory::MF_READ | - sys::Memory::MF_EXEC); - else - Prot = static_cast(sys::Memory::MF_READ | - sys::Memory::MF_WRITE); - - auto &GenericSection = G->createSection(Name, Align, Prot, SecRef.isBSS()); - - LLVM_DEBUG({ - dbgs() << "Adding section " << Name << ": " - << format("0x%016" PRIx64, SecRef.getAddress()) - << ", align: " << SecRef.getAlignment() << "\n"; - }); - - assert(!Sections.count(SectionIndex) && "Section index already in use"); - - auto &MachOSec = - Sections - .try_emplace(SectionIndex, GenericSection, SecRef.getAddress(), - SecRef.getAlignment()) - .first->second; - - if (!SecRef.isVirtual()) { - // If this section has content then record it. - Expected Content = SecRef.getContents(); - if (!Content) - return Content.takeError(); - if (Content->size() != SecRef.getSize()) - return make_error("Section content size does not match " - "declared size for " + - Name); - MachOSec.setContent(*Content); - } else { - // If this is a zero-fill section then just record the size. - MachOSec.setZeroFill(SecRef.getSize()); - } - - uint32_t SectionFlags = - Obj.is64Bit() ? Obj.getSection64(SecRef.getRawDataRefImpl()).flags - : Obj.getSection(SecRef.getRawDataRefImpl()).flags; - - MachOSec.setNoDeadStrip(SectionFlags & MachO::S_ATTR_NO_DEAD_STRIP); - } - - return Error::success(); -} - -// Adds atoms with identified start addresses (but not lengths) for all named -// atoms. -// Also, for every section that contains named atoms, but does not have an -// atom at offset zero of that section, constructs an anonymous atom covering -// that range. -Error MachOAtomGraphBuilder::addNonCustomAtoms() { - using AddrToAtomMap = std::map; - DenseMap SecToAtoms; - - DenseMap FirstOrdinal; - std::vector AltEntryAtoms; - - DenseSet ProcessedSymbols; // Used to check for duplicate defs. - - for (auto SymI = Obj.symbol_begin(), SymE = Obj.symbol_end(); SymI != SymE; - ++SymI) { - object::SymbolRef Sym(SymI->getRawDataRefImpl(), &Obj); - - auto Name = Sym.getName(); - if (!Name) - return Name.takeError(); - - // Bail out on duplicate definitions: There should never be more than one - // definition for a symbol in a given object file. 
- if (ProcessedSymbols.count(*Name)) - return make_error("Duplicate definition within object: " + - *Name); - else - ProcessedSymbols.insert(*Name); - - auto Addr = Sym.getAddress(); - if (!Addr) - return Addr.takeError(); - - auto SymType = Sym.getType(); - if (!SymType) - return SymType.takeError(); - - auto Flags = Sym.getFlags(); - - if (Flags & object::SymbolRef::SF_Undefined) { - LLVM_DEBUG(dbgs() << "Adding undef atom \"" << *Name << "\"\n"); - G->addExternalAtom(*Name); - continue; - } else if (Flags & object::SymbolRef::SF_Absolute) { - LLVM_DEBUG(dbgs() << "Adding absolute \"" << *Name << "\" addr: " - << format("0x%016" PRIx64, *Addr) << "\n"); - auto &A = G->addAbsoluteAtom(*Name, *Addr); - A.setGlobal(Flags & object::SymbolRef::SF_Global); - A.setExported(Flags & object::SymbolRef::SF_Exported); - A.setWeak(Flags & object::SymbolRef::SF_Weak); - continue; - } else if (Flags & object::SymbolRef::SF_Common) { - LLVM_DEBUG({ - dbgs() << "Adding common \"" << *Name - << "\" addr: " << format("0x%016" PRIx64, *Addr) << "\n"; - }); - auto &A = - G->addCommonAtom(getCommonSection().getGenericSection(), *Name, *Addr, - std::max(Sym.getAlignment(), 1U), - Obj.getCommonSymbolSize(Sym.getRawDataRefImpl())); - A.setGlobal(Flags & object::SymbolRef::SF_Global); - A.setExported(Flags & object::SymbolRef::SF_Exported); - continue; - } - - LLVM_DEBUG(dbgs() << "Adding defined atom \"" << *Name << "\"\n"); - - // This atom is neither undefined nor absolute, so it must be defined in - // this object. Get its section index. - auto SecItr = Sym.getSection(); - if (!SecItr) - return SecItr.takeError(); - - uint64_t SectionIndex = (*SecItr)->getIndex() + 1; - - LLVM_DEBUG(dbgs() << " to section index " << SectionIndex << "\n"); - - auto SecByIndexItr = Sections.find(SectionIndex); - if (SecByIndexItr == Sections.end()) - return make_error("Unrecognized section index in macho"); - - auto &Sec = SecByIndexItr->second; - - auto &DA = G->addDefinedAtom(Sec.getGenericSection(), *Name, *Addr, - std::max(Sym.getAlignment(), 1U)); - - DA.setGlobal(Flags & object::SymbolRef::SF_Global); - DA.setExported(Flags & object::SymbolRef::SF_Exported); - DA.setWeak(Flags & object::SymbolRef::SF_Weak); - - DA.setCallable(*SymType & object::SymbolRef::ST_Function); - - // Check NDesc flags. - { - uint16_t NDesc = 0; - if (Obj.is64Bit()) - NDesc = Obj.getSymbol64TableEntry(SymI->getRawDataRefImpl()).n_desc; - else - NDesc = Obj.getSymbolTableEntry(SymI->getRawDataRefImpl()).n_desc; - - // Record atom for alt-entry post-processing (where the layout-next - // constraints will be added). - if (NDesc & MachO::N_ALT_ENTRY) - AltEntryAtoms.push_back(&DA); - - // If this atom has a no-dead-strip attr attached then mark it live. - if (NDesc & MachO::N_NO_DEAD_STRIP) - DA.setLive(true); - } - - LLVM_DEBUG({ - dbgs() << " Added " << *Name - << " addr: " << format("0x%016" PRIx64, *Addr) - << ", align: " << DA.getAlignment() - << ", section: " << Sec.getGenericSection().getName() << "\n"; - }); - - auto &SecAtoms = SecToAtoms[&Sec]; - SecAtoms[DA.getAddress() - Sec.getAddress()] = &DA; - } - - // Add anonymous atoms. - for (auto &KV : Sections) { - auto &S = KV.second; - - // Skip empty sections. - if (S.empty()) - continue; - - // Skip sections with custom handling. - if (CustomAtomizeFunctions.count(S.getName())) - continue; - - auto SAI = SecToAtoms.find(&S); - - // If S is not in the SecToAtoms map then it contained no named atom. Add - // one anonymous atom to cover the whole section. 
- if (SAI == SecToAtoms.end()) { - SecToAtoms[&S][0] = &G->addAnonymousAtom( - S.getGenericSection(), S.getAddress(), S.getAlignment()); - continue; - } - - // Otherwise, check whether this section had an atom covering offset zero. - // If not, add one. - auto &SecAtoms = SAI->second; - if (!SecAtoms.count(0)) - SecAtoms[0] = &G->addAnonymousAtom(S.getGenericSection(), S.getAddress(), - S.getAlignment()); - } - - LLVM_DEBUG(dbgs() << "MachOGraphBuilder setting atom content\n"); - - // Set atom contents and any section-based flags. - for (auto &KV : SecToAtoms) { - auto &S = *KV.first; - auto &SecAtoms = KV.second; - - // Iterate the atoms in reverse order and set up their contents. - JITTargetAddress LastAtomAddr = S.getSize(); - for (auto I = SecAtoms.rbegin(), E = SecAtoms.rend(); I != E; ++I) { - auto Offset = I->first; - auto &A = *I->second; - LLVM_DEBUG({ - dbgs() << " " << A << " to [ " << S.getAddress() + Offset << " .. " - << S.getAddress() + LastAtomAddr << " ]\n"; - }); - - if (S.isZeroFill()) - A.setZeroFill(LastAtomAddr - Offset); - else - A.setContent(S.getContent().substr(Offset, LastAtomAddr - Offset)); - - // If the section has no-dead-strip set then mark the atom as live. - if (S.isNoDeadStrip()) - A.setLive(true); - - LastAtomAddr = Offset; - } - } - - LLVM_DEBUG(dbgs() << "Adding alt-entry starts\n"); - - // Sort alt-entry atoms by address in ascending order. - llvm::sort(AltEntryAtoms.begin(), AltEntryAtoms.end(), - [](const DefinedAtom *LHS, const DefinedAtom *RHS) { - return LHS->getAddress() < RHS->getAddress(); - }); - - // Process alt-entry atoms in address order to build the table of alt-entry - // atoms to alt-entry chain starts. - for (auto *DA : AltEntryAtoms) { - assert(!AltEntryStarts.count(DA) && "Duplicate entry in AltEntryStarts"); - - // DA is an alt-entry atom. Look for the predecessor atom that it is locked - // to, bailing out if we do not find one. - auto AltEntryPred = G->findAtomByAddress(DA->getAddress() - 1); - if (!AltEntryPred) - return AltEntryPred.takeError(); - - // Add a LayoutNext edge from the predecessor to this atom. - AltEntryPred->setLayoutNext(*DA); - - // Check to see whether the predecessor itself is an alt-entry atom. - auto AltEntryStartItr = AltEntryStarts.find(&*AltEntryPred); - if (AltEntryStartItr != AltEntryStarts.end()) { - // If the predecessor was an alt-entry atom then re-use its value. - LLVM_DEBUG({ - dbgs() << " " << *DA << " -> " << *AltEntryStartItr->second - << " (based on existing entry for " << *AltEntryPred << ")\n"; - }); - AltEntryStarts[DA] = AltEntryStartItr->second; - } else { - // If the predecessor does not have an entry then add an entry for this - // atom (i.e. the alt_entry atom) and a self-reference entry for the - /// predecessory atom that is the start of this chain. - LLVM_DEBUG({ - dbgs() << " " << *AltEntryPred << " -> " << *AltEntryPred << "\n" - << " " << *DA << " -> " << *AltEntryPred << "\n"; - }); - AltEntryStarts[&*AltEntryPred] = &*AltEntryPred; - AltEntryStarts[DA] = &*AltEntryPred; - } - } - - return Error::success(); -} - -Error MachOAtomGraphBuilder::addAtoms() { - // Add all named atoms. - if (auto Err = addNonCustomAtoms()) - return Err; - - // Process special sections. 
- for (auto &KV : Sections) { - auto &S = KV.second; - auto HI = CustomAtomizeFunctions.find(S.getGenericSection().getName()); - if (HI != CustomAtomizeFunctions.end()) { - auto &Atomize = HI->second; - if (auto Err = Atomize(S)) - return Err; - } - } - - return Error::success(); -} - -} // end namespace jitlink -} // end namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.h deleted file mode 100644 index 72d441b24d065c..00000000000000 --- a/llvm/lib/ExecutionEngine/JITLink/MachOAtomGraphBuilder.h +++ /dev/null @@ -1,138 +0,0 @@ -//===----- MachOAtomGraphBuilder.h - MachO AtomGraph builder ----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Generic MachO AtomGraph building code. -// -//===----------------------------------------------------------------------===// - -#ifndef LIB_EXECUTIONENGINE_JITLINK_MACHOATOMGRAPHBUILDER_H -#define LIB_EXECUTIONENGINE_JITLINK_MACHOATOMGRAPHBUILDER_H - -#include "llvm/ExecutionEngine/JITLink/JITLink.h" - -#include "JITLinkGeneric.h" - -#include "llvm/Object/MachO.h" - -namespace llvm { -namespace jitlink { - -class MachOAtomGraphBuilder { -public: - virtual ~MachOAtomGraphBuilder(); - Expected> buildGraph(); - -protected: - using OffsetToAtomMap = std::map; - - class MachOSection { - public: - MachOSection() = default; - - /// Create a MachO section with the given address and alignment. - MachOSection(Section &GenericSection, JITTargetAddress Address, - unsigned Alignment) - : Address(Address), GenericSection(&GenericSection), - Alignment(Alignment) {} - - /// Create a section without address, content or size (used for common - /// symbol sections). 
- MachOSection(Section &GenericSection) : GenericSection(&GenericSection) {} - - Section &getGenericSection() const { - assert(GenericSection && "Section is null"); - return *GenericSection; - } - - StringRef getName() const { - assert(GenericSection && "No generic section attached"); - return GenericSection->getName(); - } - - MachOSection &setContent(StringRef Content) { - assert(!ContentPtr && !Size && "Content/zeroFill already set"); - ContentPtr = Content.data(); - Size = Content.size(); - return *this; - } - - MachOSection &setZeroFill(uint64_t Size) { - assert(!ContentPtr && !this->Size && "Content/zeroFill already set"); - this->Size = Size; - return *this; - } - - bool isZeroFill() const { return !ContentPtr; } - - bool empty() const { return getSize() == 0; } - - size_t getSize() const { return Size; } - - StringRef getContent() const { - assert(ContentPtr && "getContent() called on zero-fill section"); - return {ContentPtr, static_cast(Size)}; - } - - JITTargetAddress getAddress() const { return Address; } - - unsigned getAlignment() const { return Alignment; } - - MachOSection &setNoDeadStrip(bool NoDeadStrip) { - this->NoDeadStrip = NoDeadStrip; - return *this; - } - - bool isNoDeadStrip() const { return NoDeadStrip; } - - private: - JITTargetAddress Address = 0; - Section *GenericSection = nullptr; - const char *ContentPtr = nullptr; - uint64_t Size = 0; - unsigned Alignment = 0; - bool NoDeadStrip = false; - }; - - using CustomAtomizeFunction = std::function; - - MachOAtomGraphBuilder(const object::MachOObjectFile &Obj); - - AtomGraph &getGraph() const { return *G; } - - const object::MachOObjectFile &getObject() const { return Obj; } - - void addCustomAtomizer(StringRef SectionName, CustomAtomizeFunction Atomizer); - - virtual Error addRelocations() = 0; - - /// Returns true if Atom A and Atom B are at a fixed offset from one another - /// (i.e. if they're part of the same alt-entry chain). - bool areLayoutLocked(const Atom &A, const Atom &B); - -private: - static unsigned getPointerSize(const object::MachOObjectFile &Obj); - static support::endianness getEndianness(const object::MachOObjectFile &Obj); - - MachOSection &getCommonSection(); - - Error parseSections(); - Error addNonCustomAtoms(); - Error addAtoms(); - - const object::MachOObjectFile &Obj; - std::unique_ptr G; - DenseMap AltEntryStarts; - DenseMap Sections; - StringMap CustomAtomizeFunctions; - Optional CommonSymbolsSection; -}; - -} // end namespace jitlink -} // end namespace llvm - -#endif // LIB_EXECUTIONENGINE_JITLINK_MACHOATOMGRAPHBUILDER_H diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp new file mode 100644 index 00000000000000..7366f53ebf36bf --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -0,0 +1,535 @@ +//=--------- MachOLinkGraphBuilder.cpp - MachO LinkGraph builder ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic MachO LinkGraph buliding code. 
+// +//===----------------------------------------------------------------------===// + +#include "MachOLinkGraphBuilder.h" + +#define DEBUG_TYPE "jitlink" + +static const char *CommonSectionName = "__common"; + +namespace llvm { +namespace jitlink { + +MachOLinkGraphBuilder::~MachOLinkGraphBuilder() {} + +Expected> MachOLinkGraphBuilder::buildGraph() { + + // Sanity check: we only operate on relocatable objects. + if (!Obj.isRelocatableObject()) + return make_error("Object is not a relocatable MachO"); + + if (auto Err = createNormalizedSections()) + return std::move(Err); + + if (auto Err = createNormalizedSymbols()) + return std::move(Err); + + if (auto Err = graphifyRegularSymbols()) + return std::move(Err); + + if (auto Err = graphifySectionsWithCustomParsers()) + return std::move(Err); + + if (auto Err = addRelocations()) + return std::move(Err); + + return std::move(G); +} + +MachOLinkGraphBuilder::MachOLinkGraphBuilder(const object::MachOObjectFile &Obj) + : Obj(Obj), + G(std::make_unique(Obj.getFileName(), getPointerSize(Obj), + getEndianness(Obj))) {} + +void MachOLinkGraphBuilder::addCustomSectionParser( + StringRef SectionName, SectionParserFunction Parser) { + assert(!CustomSectionParserFunctions.count(SectionName) && + "Custom parser for this section already exists"); + CustomSectionParserFunctions[SectionName] = std::move(Parser); +} + +Linkage MachOLinkGraphBuilder::getLinkage(uint16_t Desc) { + if ((Desc & MachO::N_WEAK_DEF) || (Desc & MachO::N_WEAK_REF)) + return Linkage::Weak; + return Linkage::Strong; +} + +Scope MachOLinkGraphBuilder::getScope(StringRef Name, uint8_t Type) { + if (Name.startswith("l")) + return Scope::Local; + if (Type & MachO::N_PEXT) + return Scope::Hidden; + if (Type & MachO::N_EXT) + return Scope::Default; + return Scope::Local; +} + +bool MachOLinkGraphBuilder::isAltEntry(const NormalizedSymbol &NSym) { + return NSym.Desc & MachO::N_ALT_ENTRY; +} + +unsigned +MachOLinkGraphBuilder::getPointerSize(const object::MachOObjectFile &Obj) { + return Obj.is64Bit() ? 8 : 4; +} + +support::endianness +MachOLinkGraphBuilder::getEndianness(const object::MachOObjectFile &Obj) { + return Obj.isLittleEndian() ? support::little : support::big; +} + +Section &MachOLinkGraphBuilder::getCommonSection() { + if (!CommonSection) { + auto Prot = static_cast( + sys::Memory::MF_READ | sys::Memory::MF_WRITE); + CommonSection = &G->createSection(CommonSectionName, Prot); + } + return *CommonSection; +} + +Error MachOLinkGraphBuilder::createNormalizedSections() { + // Build normalized sections. Verifies that section data is in-range (for + // sections with content) and that address ranges are non-overlapping. 
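getScope and getLinkage above condense the relevant MachO nlist bits into the new Scope and Linkage enums: names starting with the literal "l" prefix (assembler-local labels) stay Local, private-extern symbols become Hidden, extern symbols become Default, and either weak-def or weak-ref in n_desc yields weak linkage. A self-contained restatement with stand-in enums; the bit values are taken from <mach-o/nlist.h> and are an assumption here, not part of this patch:

#include <cstdint>
#include <string>

enum class Linkage { Strong, Weak };
enum class Scope { Default, Hidden, Local };

// nlist bit values as documented in <mach-o/nlist.h>; treat as assumptions.
constexpr uint8_t N_PEXT = 0x10;
constexpr uint8_t N_EXT = 0x01;
constexpr uint16_t N_WEAK_REF = 0x0040;
constexpr uint16_t N_WEAK_DEF = 0x0080;

Linkage linkageFor(uint16_t Desc) {
  return (Desc & (N_WEAK_DEF | N_WEAK_REF)) ? Linkage::Weak : Linkage::Strong;
}

Scope scopeFor(const std::string &Name, uint8_t Type) {
  if (!Name.empty() && Name[0] == 'l')   // assembler-local label
    return Scope::Local;
  if (Type & N_PEXT)                     // private extern -> hidden
    return Scope::Hidden;
  if (Type & N_EXT)                      // extern -> default visibility
    return Scope::Default;
  return Scope::Local;
}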
+ + LLVM_DEBUG(dbgs() << "Creating normalized sections...\n"); + + for (auto &SecRef : Obj.sections()) { + NormalizedSection NSec; + uint32_t DataOffset = 0; + + auto SecIndex = Obj.getSectionIndex(SecRef.getRawDataRefImpl()); + + auto Name = SecRef.getName(); + if (!Name) + return Name.takeError(); + + if (Obj.is64Bit()) { + const MachO::section_64 &Sec64 = + Obj.getSection64(SecRef.getRawDataRefImpl()); + + NSec.Address = Sec64.addr; + NSec.Size = Sec64.size; + NSec.Alignment = 1ULL << Sec64.align; + NSec.Flags = Sec64.flags; + DataOffset = Sec64.offset; + } else { + const MachO::section &Sec32 = Obj.getSection(SecRef.getRawDataRefImpl()); + NSec.Address = Sec32.addr; + NSec.Size = Sec32.size; + NSec.Alignment = 1ULL << Sec32.align; + NSec.Flags = Sec32.flags; + DataOffset = Sec32.offset; + } + + LLVM_DEBUG({ + dbgs() << " " << *Name << ": " << formatv("{0:x16}", NSec.Address) + << " -- " << formatv("{0:x16}", NSec.Address + NSec.Size) + << ", align: " << NSec.Alignment << ", index: " << SecIndex + << "\n"; + }); + + // Get the section data if any. + { + unsigned SectionType = NSec.Flags & MachO::SECTION_TYPE; + if (SectionType != MachO::S_ZEROFILL && + SectionType != MachO::S_GB_ZEROFILL) { + + if (DataOffset + NSec.Size > Obj.getData().size()) + return make_error( + "Section data extends past end of file"); + + NSec.Data = Obj.getData().data() + DataOffset; + } + } + + // Get prot flags. + // FIXME: Make sure this test is correct (it's probably missing cases + // as-is). + sys::Memory::ProtectionFlags Prot; + if (NSec.Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) + Prot = static_cast(sys::Memory::MF_READ | + sys::Memory::MF_EXEC); + else + Prot = static_cast(sys::Memory::MF_READ | + sys::Memory::MF_WRITE); + + NSec.GraphSection = &G->createSection(*Name, Prot); + IndexToSection.insert(std::make_pair(SecIndex, std::move(NSec))); + } + + std::vector Sections; + Sections.reserve(IndexToSection.size()); + for (auto &KV : IndexToSection) + Sections.push_back(&KV.second); + + // If we didn't end up creating any sections then bail out. The code below + // assumes that we have at least one section. 
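The sort-and-compare loop a little further below rejects overlapping section address ranges: sections are ordered by start address, and any section that begins before its predecessor ends is an error. The same check in isolation, with a hypothetical Range record standing in for NormalizedSection:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

struct Range { uint64_t Addr, Size; };   // stand-in for NormalizedSection

// Sort by start address; a neighbour that starts before its predecessor
// ends is an overlap -- the same condition the error path below uses.
std::optional<std::size_t> findOverlap(std::vector<Range> Rs) {
  std::sort(Rs.begin(), Rs.end(),
            [](const Range &L, const Range &R) { return L.Addr < R.Addr; });
  for (std::size_t I = 0; I + 1 < Rs.size(); ++I)
    if (Rs[I + 1].Addr < Rs[I].Addr + Rs[I].Size)
      return I + 1;
  return std::nullopt;
}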
+ if (Sections.empty()) + return Error::success(); + + llvm::sort(Sections, + [](const NormalizedSection *LHS, const NormalizedSection *RHS) { + assert(LHS && RHS && "Null section?"); + return LHS->Address < RHS->Address; + }); + + for (unsigned I = 0, E = Sections.size() - 1; I != E; ++I) { + auto &Cur = *Sections[I]; + auto &Next = *Sections[I + 1]; + if (Next.Address < Cur.Address + Cur.Size) + return make_error( + "Address range for section " + Cur.GraphSection->getName() + + formatv(" [ {0:x16} -- {1:x16} ] ", Cur.Address, + Cur.Address + Cur.Size) + + "overlaps " + + formatv(" [ {0:x16} -- {1:x16} ] ", Next.Address, + Next.Address + Next.Size)); + } + + return Error::success(); +} + +Error MachOLinkGraphBuilder::createNormalizedSymbols() { + LLVM_DEBUG(dbgs() << "Creating normalized symbols...\n"); + + for (auto &SymRef : Obj.symbols()) { + + unsigned SymbolIndex = Obj.getSymbolIndex(SymRef.getRawDataRefImpl()); + uint64_t Value; + uint32_t NStrX; + uint8_t Type; + uint8_t Sect; + uint16_t Desc; + + if (Obj.is64Bit()) { + const MachO::nlist_64 &NL64 = + Obj.getSymbol64TableEntry(SymRef.getRawDataRefImpl()); + Value = NL64.n_value; + NStrX = NL64.n_strx; + Type = NL64.n_type; + Sect = NL64.n_sect; + Desc = NL64.n_desc; + } else { + const MachO::nlist &NL32 = + Obj.getSymbolTableEntry(SymRef.getRawDataRefImpl()); + Value = NL32.n_value; + NStrX = NL32.n_strx; + Type = NL32.n_type; + Sect = NL32.n_sect; + Desc = NL32.n_desc; + } + + // Skip stabs. + // FIXME: Are there other symbols we should be skipping? + if (Type & MachO::N_STAB) + continue; + + Optional Name; + if (NStrX) { + if (auto NameOrErr = SymRef.getName()) + Name = *NameOrErr; + else + return NameOrErr.takeError(); + } + + LLVM_DEBUG({ + dbgs() << " "; + if (!Name) + dbgs() << ""; + else + dbgs() << *Name; + dbgs() << ": value = " << formatv("{0:x16}", Value) + << ", type = " << formatv("{0:x2}", Type) + << ", desc = " << formatv("{0:x4}", Desc) << ", sect = "; + if (Sect) + dbgs() << static_cast(Sect - 1); + else + dbgs() << "none"; + dbgs() << "\n"; + }); + + // If this symbol has a section, sanity check that the addresses line up. + NormalizedSection *NSec = nullptr; + if (Sect != 0) { + if (auto NSecOrErr = findSectionByIndex(Sect - 1)) + NSec = &*NSecOrErr; + else + return NSecOrErr.takeError(); + + if (Value < NSec->Address || Value > NSec->Address + NSec->Size) + return make_error("Symbol address does not fall within " + "section"); + } + + IndexToSymbol[SymbolIndex] = + &createNormalizedSymbol(*Name, Value, Type, Sect, Desc, + getLinkage(Type), getScope(*Name, Type)); + } + + return Error::success(); +} + +void MachOLinkGraphBuilder::addSectionStartSymAndBlock( + Section &GraphSec, uint64_t Address, const char *Data, uint64_t Size, + uint32_t Alignment, bool IsLive) { + Block &B = + Data ? G->createContentBlock(GraphSec, StringRef(Data, Size), Address, + Alignment, 0) + : G->createZeroFillBlock(GraphSec, Size, Address, Alignment, 0); + auto &Sym = G->addAnonymousSymbol(B, 0, Size, false, IsLive); + assert(!AddrToCanonicalSymbol.count(Sym.getAddress()) && + "Anonymous block start symbol clashes with existing symbol address"); + AddrToCanonicalSymbol[Sym.getAddress()] = &Sym; +} + +Error MachOLinkGraphBuilder::graphifyRegularSymbols() { + + LLVM_DEBUG(dbgs() << "Creating graph symbols...\n"); + + /// We only have 256 section indexes: Use a vector rather than a map. 
+ std::vector> SecIndexToSymbols; + SecIndexToSymbols.resize(256); + + // Create commons, externs, and absolutes, and partition all other symbols by + // section. + for (auto &KV : IndexToSymbol) { + auto &NSym = *KV.second; + + switch (NSym.Type & MachO::N_TYPE) { + case MachO::N_UNDF: + if (NSym.Value) { + if (!NSym.Name) + return make_error("Anonymous common symbol at index " + + Twine(KV.first)); + NSym.GraphSymbol = &G->addCommonSymbol( + *NSym.Name, NSym.S, getCommonSection(), NSym.Value, 0, + 1ull << MachO::GET_COMM_ALIGN(NSym.Desc), + NSym.Desc & MachO::N_NO_DEAD_STRIP); + } else { + if (!NSym.Name) + return make_error("Anonymous external symbol at " + "index " + + Twine(KV.first)); + NSym.GraphSymbol = &G->addExternalSymbol(*NSym.Name, 0); + } + break; + case MachO::N_ABS: + if (!NSym.Name) + return make_error("Anonymous absolute symbol at index " + + Twine(KV.first)); + NSym.GraphSymbol = &G->addAbsoluteSymbol( + *NSym.Name, NSym.Value, 0, Linkage::Strong, Scope::Default, + NSym.Desc & MachO::N_NO_DEAD_STRIP); + break; + case MachO::N_SECT: + SecIndexToSymbols[NSym.Sect - 1].push_back(&NSym); + break; + case MachO::N_PBUD: + return make_error( + "Unupported N_PBUD symbol " + + (NSym.Name ? ("\"" + *NSym.Name + "\"") : Twine("")) + + " at index " + Twine(KV.first)); + case MachO::N_INDR: + return make_error( + "Unupported N_INDR symbol " + + (NSym.Name ? ("\"" + *NSym.Name + "\"") : Twine("")) + + " at index " + Twine(KV.first)); + default: + return make_error( + "Unrecognized symbol type " + Twine(NSym.Type & MachO::N_TYPE) + + " for symbol " + + (NSym.Name ? ("\"" + *NSym.Name + "\"") : Twine("")) + + " at index " + Twine(KV.first)); + } + } + + // Loop over sections performing regular graphification for those that + // don't have custom parsers. + for (auto &KV : IndexToSection) { + auto SecIndex = KV.first; + auto &NSec = KV.second; + + // Skip sections with custom parsers. + if (CustomSectionParserFunctions.count(NSec.GraphSection->getName())) { + LLVM_DEBUG({ + dbgs() << " Skipping section " << NSec.GraphSection->getName() + << " as it has a custom parser.\n"; + }); + continue; + } else + LLVM_DEBUG({ + dbgs() << " Processing section " << NSec.GraphSection->getName() + << "...\n"; + }); + + bool SectionIsNoDeadStrip = NSec.Flags & MachO::S_ATTR_NO_DEAD_STRIP; + bool SectionIsText = NSec.Flags & MachO::S_ATTR_PURE_INSTRUCTIONS; + + auto &SecNSymStack = SecIndexToSymbols[SecIndex]; + + // If this section is non-empty but there are no symbols covering it then + // create one block and anonymous symbol to cover the entire section. + if (SecNSymStack.empty()) { + if (NSec.Size > 0) { + LLVM_DEBUG({ + dbgs() << " Section non-empty, but contains no symbols. " + "Creating anonymous block to cover " + << formatv("{0:x16}", NSec.Address) << " -- " + << formatv("{0:x16}", NSec.Address + NSec.Size) << "\n"; + }); + addSectionStartSymAndBlock(*NSec.GraphSection, NSec.Address, NSec.Data, + NSec.Size, NSec.Alignment, + SectionIsNoDeadStrip); + } else + LLVM_DEBUG({ + dbgs() << " Section empty and contains no symbols. Skipping.\n"; + }); + continue; + } + + // Sort the symbol stack in by address, alt-entry status, scope, and name. + // We sort in reverse order so that symbols will be visited in the right + // order when we pop off the stack below. 
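The sort and stack-walk that follow carve each section into blocks: symbols are visited in ascending address order, alt-entry symbols (and aliases at the same address) are folded into the block opened by their predecessor, and each block runs to the next block's first symbol or to the end of the section. A stand-alone sketch of that partitioning, with hypothetical Sym and Block types:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

struct Sym { uint64_t Addr; bool AltEntry; };   // hypothetical stand-in
using Block = std::pair<uint64_t, uint64_t>;    // {start address, size}

std::vector<Block> partition(uint64_t SecAddr, uint64_t SecSize,
                             std::vector<Sym> Syms) {
  // Reverse-sort by address so groups can be popped off the back,
  // mirroring the reverse-sorted SecNSymStack below.
  std::sort(Syms.begin(), Syms.end(),
            [](const Sym &L, const Sym &R) { return L.Addr > R.Addr; });
  std::vector<Block> Blocks;
  while (!Syms.empty()) {
    uint64_t Start = Syms.back().Addr, Last = Start;
    Syms.pop_back();
    // Alt-entry symbols and same-address aliases join the current block.
    while (!Syms.empty() &&
           (Syms.back().AltEntry || Syms.back().Addr == Last)) {
      Last = Syms.back().Addr;
      Syms.pop_back();
    }
    uint64_t End = Syms.empty() ? SecAddr + SecSize : Syms.back().Addr;
    Blocks.push_back({Start, End - Start});
  }
  return Blocks;
}

The real code additionally inserts an anonymous block when the first symbol does not sit at the section start, and rejects sections whose first symbol is an alt-entry.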
+ llvm::sort(SecNSymStack, [](const NormalizedSymbol *LHS, + const NormalizedSymbol *RHS) { + if (LHS->Value != RHS->Value) + return LHS->Value > RHS->Value; + if (isAltEntry(*LHS) != isAltEntry(*RHS)) + return isAltEntry(*RHS); + if (LHS->S != RHS->S) + return static_cast(LHS->S) < static_cast(RHS->S); + return LHS->Name < RHS->Name; + }); + + // The first symbol in a section can not be an alt-entry symbol. + if (!SecNSymStack.empty() && isAltEntry(*SecNSymStack.back())) + return make_error( + "First symbol in " + NSec.GraphSection->getName() + " is alt-entry"); + + // If the section is non-empty but there is no symbol covering the start + // address then add an anonymous one. + if (SecNSymStack.back()->Value != NSec.Address) { + auto AnonBlockSize = SecNSymStack.back()->Value - NSec.Address; + LLVM_DEBUG({ + dbgs() << " Section start not covered by symbol. " + << "Creating anonymous block to cover [ " + << formatv("{0:x16}", NSec.Address) << " -- " + << formatv("{0:x16}", NSec.Address + AnonBlockSize) << " ]\n"; + }); + addSectionStartSymAndBlock(*NSec.GraphSection, NSec.Address, NSec.Data, + AnonBlockSize, NSec.Alignment, + SectionIsNoDeadStrip); + } + + // Visit section symbols in order by popping off the reverse-sorted stack, + // building blocks for each alt-entry chain and creating symbols as we go. + while (!SecNSymStack.empty()) { + SmallVector BlockSyms; + + BlockSyms.push_back(SecNSymStack.back()); + SecNSymStack.pop_back(); + while (!SecNSymStack.empty() && + (isAltEntry(*SecNSymStack.back()) || + SecNSymStack.back()->Value == BlockSyms.back()->Value)) { + BlockSyms.push_back(SecNSymStack.back()); + SecNSymStack.pop_back(); + } + + // BlockNSyms now contains the block symbols in reverse canonical order. + JITTargetAddress BlockStart = BlockSyms.front()->Value; + JITTargetAddress BlockEnd = SecNSymStack.empty() + ? NSec.Address + NSec.Size + : SecNSymStack.back()->Value; + JITTargetAddress BlockOffset = BlockStart - NSec.Address; + JITTargetAddress BlockSize = BlockEnd - BlockStart; + + LLVM_DEBUG({ + dbgs() << " Creating block for " << formatv("{0:x16}", BlockStart) + << " -- " << formatv("{0:x16}", BlockEnd) << ": " + << NSec.GraphSection->getName() << " + " + << formatv("{0:x16}", BlockOffset) << " with " + << BlockSyms.size() << " symbol(s)...\n"; + }); + + Block &B = + NSec.Data + ? G->createContentBlock( + *NSec.GraphSection, + StringRef(NSec.Data + BlockOffset, BlockSize), BlockStart, + NSec.Alignment, BlockStart % NSec.Alignment) + : G->createZeroFillBlock(*NSec.GraphSection, BlockSize, + BlockStart, NSec.Alignment, + BlockStart % NSec.Alignment); + + Optional LastCanonicalAddr; + JITTargetAddress SymEnd = BlockEnd; + while (!BlockSyms.empty()) { + auto &NSym = *BlockSyms.back(); + BlockSyms.pop_back(); + + bool SymLive = + (NSym.Desc & MachO::N_NO_DEAD_STRIP) || SectionIsNoDeadStrip; + + LLVM_DEBUG({ + dbgs() << " " << formatv("{0:x16}", NSym.Value) << " -- " + << formatv("{0:x16}", SymEnd) << ": "; + if (!NSym.Name) + dbgs() << ""; + else + dbgs() << NSym.Name; + if (SymLive) + dbgs() << " [no-dead-strip]"; + if (LastCanonicalAddr == NSym.Value) + dbgs() << " [non-canonical]"; + dbgs() << "\n"; + }); + + auto &Sym = + NSym.Name + ? 
G->addDefinedSymbol(B, NSym.Value - BlockStart, *NSym.Name, + SymEnd - NSym.Value, NSym.L, NSym.S, + SectionIsText, SymLive) + : G->addAnonymousSymbol(B, NSym.Value - BlockStart, + SymEnd - NSym.Value, SectionIsText, + SymLive); + NSym.GraphSymbol = &Sym; + if (LastCanonicalAddr != Sym.getAddress()) { + if (LastCanonicalAddr) + SymEnd = *LastCanonicalAddr; + LastCanonicalAddr = Sym.getAddress(); + setCanonicalSymbol(Sym); + } + } + } + } + + return Error::success(); +} + +Error MachOLinkGraphBuilder::graphifySectionsWithCustomParsers() { + // Graphify special sections. + for (auto &KV : IndexToSection) { + auto &NSec = KV.second; + + auto HI = CustomSectionParserFunctions.find(NSec.GraphSection->getName()); + if (HI != CustomSectionParserFunctions.end()) { + auto &Parse = HI->second; + if (auto Err = Parse(NSec)) + return Err; + } + } + + return Error::success(); +} + +} // end namespace jitlink +} // end namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h new file mode 100644 index 00000000000000..e1123cd1104876 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h @@ -0,0 +1,269 @@ +//===----- MachOLinkGraphBuilder.h - MachO LinkGraph builder ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic MachO LinkGraph building code. +// +//===----------------------------------------------------------------------===// + +#ifndef LIB_EXECUTIONENGINE_JITLINK_MACHOLINKGRAPHBUILDER_H +#define LIB_EXECUTIONENGINE_JITLINK_MACHOLINKGRAPHBUILDER_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +#include "EHFrameSupportImpl.h" +#include "JITLinkGeneric.h" +#include "llvm/Object/MachO.h" + +#include + +namespace llvm { +namespace jitlink { + +class MachOLinkGraphBuilder { +public: + virtual ~MachOLinkGraphBuilder(); + Expected> buildGraph(); + +protected: + class MachOEHFrameBinaryParser : public EHFrameBinaryParser { + public: + MachOEHFrameBinaryParser(MachOLinkGraphBuilder &Builder, + JITTargetAddress EHFrameAddress, + StringRef EHFrameContent, Section &EHFrameSection, + uint64_t CIEAlignment, uint64_t FDEAlignment, + Edge::Kind FDEToCIERelocKind, + Edge::Kind FDEToTargetRelocKind) + : EHFrameBinaryParser(EHFrameAddress, EHFrameContent, + Builder.getGraph().getPointerSize(), + Builder.getGraph().getEndianness()), + Builder(Builder), EHFrameSection(EHFrameSection), + CIEAlignment(CIEAlignment), FDEAlignment(FDEAlignment), + FDEToCIERelocKind(FDEToCIERelocKind), + FDEToTargetRelocKind(FDEToTargetRelocKind) {} + + Symbol *getSymbolAtAddress(JITTargetAddress Address) override { + if (auto *Sym = Builder.getSymbolByAddress(Address)) + if (Sym->getAddress() == Address) + return Sym; + return nullptr; + } + + Symbol &createCIERecord(JITTargetAddress RecordAddr, + StringRef RecordContent) override { + auto &G = Builder.getGraph(); + auto &B = G.createContentBlock(EHFrameSection, RecordContent, RecordAddr, + CIEAlignment, 0); + auto &CIESymbol = + G.addAnonymousSymbol(B, 0, RecordContent.size(), false, false); + Builder.setCanonicalSymbol(CIESymbol); + return CIESymbol; + } + + Expected createFDERecord(JITTargetAddress RecordAddr, + StringRef RecordContent, Symbol &CIE, + size_t CIEOffset, Symbol &Func, + size_t 
FuncOffset, Symbol *LSDA, + size_t LSDAOffset) override { + auto &G = Builder.getGraph(); + auto &B = G.createContentBlock(EHFrameSection, RecordContent, RecordAddr, + FDEAlignment, 0); + + // Add edges to CIE, Func, and (conditionally) LSDA. + B.addEdge(FDEToCIERelocKind, CIEOffset, CIE, 0); + B.addEdge(FDEToTargetRelocKind, FuncOffset, Func, 0); + + if (LSDA) + B.addEdge(FDEToTargetRelocKind, LSDAOffset, *LSDA, 0); + + auto &FDESymbol = + G.addAnonymousSymbol(B, 0, RecordContent.size(), false, false); + + // Add a keep-alive relocation from the function to the FDE to ensure it + // is not dead stripped. + Func.getBlock().addEdge(Edge::KeepAlive, 0, FDESymbol, 0); + + return FDESymbol; + } + + private: + MachOLinkGraphBuilder &Builder; + Section &EHFrameSection; + uint64_t CIEAlignment; + uint64_t FDEAlignment; + Edge::Kind FDEToCIERelocKind; + Edge::Kind FDEToTargetRelocKind; + }; + + struct NormalizedSymbol { + friend class MachOLinkGraphBuilder; + + private: + NormalizedSymbol(Optional Name, uint64_t Value, uint8_t Type, + uint8_t Sect, uint16_t Desc, Linkage L, Scope S) + : Name(Name), Value(Value), Type(Type), Sect(Sect), Desc(Desc), L(L), + S(S) { + assert((!Name || !Name->empty()) && "Name must be none or non-empty"); + } + + public: + NormalizedSymbol(const NormalizedSymbol &) = delete; + NormalizedSymbol &operator=(const NormalizedSymbol &) = delete; + NormalizedSymbol(NormalizedSymbol &&) = delete; + NormalizedSymbol &operator=(NormalizedSymbol &&) = delete; + + Optional Name; + uint64_t Value = 0; + uint8_t Type = 0; + uint8_t Sect = 0; + uint16_t Desc = 0; + Linkage L = Linkage::Strong; + Scope S = Scope::Default; + Symbol *GraphSymbol = nullptr; + }; + + class NormalizedSection { + friend class MachOLinkGraphBuilder; + + private: + NormalizedSection() = default; + + public: + Section *GraphSection = nullptr; + uint64_t Address = 0; + uint64_t Size = 0; + uint64_t Alignment = 0; + uint32_t Flags = 0; + const char *Data = nullptr; + }; + + using SectionParserFunction = std::function; + + MachOLinkGraphBuilder(const object::MachOObjectFile &Obj); + + LinkGraph &getGraph() const { return *G; } + + const object::MachOObjectFile &getObject() const { return Obj; } + + void addCustomSectionParser(StringRef SectionName, + SectionParserFunction Parse); + + virtual Error addRelocations() = 0; + + /// Create a symbol. + template + NormalizedSymbol &createNormalizedSymbol(ArgTs &&... Args) { + NormalizedSymbol *Sym = reinterpret_cast( + Allocator.Allocate()); + new (Sym) NormalizedSymbol(std::forward(Args)...); + return *Sym; + } + + /// Index is zero-based (MachO section indexes are usually one-based) and + /// assumed to be in-range. Client is responsible for checking. + NormalizedSection &getSectionByIndex(unsigned Index) { + auto I = IndexToSection.find(Index); + assert(I != IndexToSection.end() && "No section recorded at index"); + return I->second; + } + + /// Try to get the section at the given index. Will return an error if the + /// given index is out of range, or if no section has been added for the given + /// index. + Expected findSectionByIndex(unsigned Index) { + auto I = IndexToSection.find(Index); + if (I == IndexToSection.end()) + return make_error("No section recorded for index " + + formatv("{0:u}", Index)); + return I->second; + } + + /// Try to get the symbol at the given index. Will return an error if the + /// given index is out of range, or if no symbol has been added for the given + /// index. 
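createNormalizedSymbol above hands out records from a BumpPtrAllocator via placement new, so normalized symbols are never individually freed and their destructors never run; the allocator just has to outlive them. A toy version of the same pattern (the real allocator is llvm::BumpPtrAllocator, and NormalizedSym here is a cut-down stand-in):

#include <cstddef>
#include <cstdint>
#include <memory>
#include <new>
#include <utility>
#include <vector>

// Toy arena: one max-aligned slab per allocation, freed only when the arena
// is destroyed.
class Arena {
  std::vector<std::unique_ptr<std::max_align_t[]>> Slabs;

public:
  void *allocate(std::size_t Size) {
    std::size_t N =
        (Size + sizeof(std::max_align_t) - 1) / sizeof(std::max_align_t);
    Slabs.push_back(std::make_unique<std::max_align_t[]>(N));
    return Slabs.back().get();
  }
};

struct NormalizedSym {           // hypothetical stand-in record
  NormalizedSym(uint64_t Value, uint8_t Type) : Value(Value), Type(Type) {}
  uint64_t Value;
  uint8_t Type;
};

// Raw arena memory plus placement new: the destructor is never run, and the
// arena must outlive every record handed out this way.
template <typename T, typename... ArgTs>
T &createInArena(Arena &A, ArgTs &&... Args) {
  return *new (A.allocate(sizeof(T))) T(std::forward<ArgTs>(Args)...);
}

// Usage: Arena A; auto &S = createInArena<NormalizedSym>(A, 0x1000, 0x0e);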
+ Expected findSymbolByIndex(uint64_t Index) { + if (Index >= IndexToSymbol.size()) + return make_error("Symbol index out of range"); + auto *Sym = IndexToSymbol[Index]; + if (!Sym) + return make_error("No symbol at index " + + formatv("{0:u}", Index)); + return *Sym; + } + + /// Returns the symbol with the highest address not greater than the search + /// address, or null if no such symbol exists. + Symbol *getSymbolByAddress(JITTargetAddress Address) { + auto I = AddrToCanonicalSymbol.upper_bound(Address); + if (I == AddrToCanonicalSymbol.begin()) + return nullptr; + return std::prev(I)->second; + } + + /// Returns the symbol with the highest address not greater than the search + /// address, or an error if no such symbol exists. + Expected findSymbolByAddress(JITTargetAddress Address) { + auto *Sym = getSymbolByAddress(Address); + if (Sym) + if (Address < Sym->getAddress() + Sym->getSize()) + return *Sym; + return make_error("No symbol covering address " + + formatv("{0:x16}", Address)); + } + + static Linkage getLinkage(uint16_t Desc); + static Scope getScope(StringRef Name, uint8_t Type); + static bool isAltEntry(const NormalizedSymbol &NSym); + +private: + static unsigned getPointerSize(const object::MachOObjectFile &Obj); + static support::endianness getEndianness(const object::MachOObjectFile &Obj); + + void setCanonicalSymbol(Symbol &Sym) { + auto *&CanonicalSymEntry = AddrToCanonicalSymbol[Sym.getAddress()]; + // There should be no symbol at this address, or, if there is, + // it should be a zero-sized symbol from an empty section (which + // we can safely override). + assert((!CanonicalSymEntry || CanonicalSymEntry->getSize() == 0) && + "Duplicate canonical symbol at address"); + CanonicalSymEntry = &Sym; + } + + Section &getCommonSection(); + void addSectionStartSymAndBlock(Section &GraphSec, uint64_t Address, + const char *Data, uint64_t Size, + uint32_t Alignment, bool IsLive); + + Error createNormalizedSections(); + Error createNormalizedSymbols(); + + /// Create graph blocks and symbols for externals, absolutes, commons and + /// all defined symbols in sections without custom parsers. + Error graphifyRegularSymbols(); + + /// Create graph blocks and symbols for all sections. + Error graphifySectionsWithCustomParsers(); + + // Put the BumpPtrAllocator first so that we don't free any of the underlying + // memory until the Symbol/Addressable destructors have been run. 
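getSymbolByAddress above is the standard ordered-map trick for "greatest key not above the query": take upper_bound (the first entry strictly greater than the address) and step back one. The same pattern in isolation:

#include <cstdint>
#include <iterator>
#include <map>
#include <string>

// Stand-in for AddrToCanonicalSymbol (address -> canonical Symbol*).
using AddrMap = std::map<uint64_t, const std::string *>;

const std::string *symbolAtOrBelow(const AddrMap &M, uint64_t Addr) {
  auto I = M.upper_bound(Addr);   // first entry with key > Addr
  if (I == M.begin())
    return nullptr;               // nothing at or below Addr
  return std::prev(I)->second;    // greatest key <= Addr
}

findSymbolByAddress then additionally checks that the query really falls inside the returned symbol's [address, address + size) range before treating it as a hit.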
+ BumpPtrAllocator Allocator; + + const object::MachOObjectFile &Obj; + std::unique_ptr G; + + DenseMap IndexToSection; + Section *CommonSection = nullptr; + + DenseMap IndexToSymbol; + std::map AddrToCanonicalSymbol; + StringMap CustomSectionParserFunctions; +}; + +} // end namespace jitlink +} // end namespace llvm + +#endif // LIB_EXECUTIONENGINE_JITLINK_MACHOLINKGRAPHBUILDER_H diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 52481f8436e9e2..d83787ffd59869 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -13,7 +13,7 @@ #include "llvm/ExecutionEngine/JITLink/MachO_x86_64.h" #include "BasicGOTAndStubsBuilder.h" -#include "MachOAtomGraphBuilder.h" +#include "MachOLinkGraphBuilder.h" #define DEBUG_TYPE "jitlink" @@ -23,16 +23,21 @@ using namespace llvm::jitlink::MachO_x86_64_Edges; namespace { -class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { +class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { public: - MachOAtomGraphBuilder_x86_64(const object::MachOObjectFile &Obj) - : MachOAtomGraphBuilder(Obj), - NumSymbols(Obj.getSymtabLoadCommand().nsyms) { - addCustomAtomizer("__eh_frame", [this](MachOSection &EHFrameSection) { - return addEHFrame(getGraph(), EHFrameSection.getGenericSection(), - EHFrameSection.getContent(), - EHFrameSection.getAddress(), NegDelta32, Delta64); - }); + MachOLinkGraphBuilder_x86_64(const object::MachOObjectFile &Obj) + : MachOLinkGraphBuilder(Obj) { + addCustomSectionParser( + "__eh_frame", [this](NormalizedSection &EHFrameSection) { + if (!EHFrameSection.Data) + return make_error( + "__eh_frame section is marked zero-fill"); + return MachOEHFrameBinaryParser( + *this, EHFrameSection.Address, + StringRef(EHFrameSection.Data, EHFrameSection.Size), + *EHFrameSection.GraphSection, 8, 4, NegDelta32, Delta64) + .addToGraph(); + }); } private: @@ -102,17 +107,6 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { ", length=" + formatv("{0:d}", RI.r_length)); } - Expected findAtomBySymbolIndex(const MachO::relocation_info &RI) { - auto &Obj = getObject(); - if (RI.r_symbolnum >= NumSymbols) - return make_error("Symbol index out of range"); - auto SymI = Obj.getSymbolByIndex(RI.r_symbolnum); - auto Name = SymI->getName(); - if (!Name) - return Name.takeError(); - return getGraph().getAtomByName(*Name); - } - MachO::relocation_info getRelocationInfo(const object::relocation_iterator RelItr) { MachO::any_relocation_info ARI = @@ -122,12 +116,12 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { return RI; } - using PairRelocInfo = std::tuple; + using PairRelocInfo = std::tuple; // Parses paired SUBTRACTOR/UNSIGNED relocations and, on success, // returns the edge kind and addend to be used. 
Expected - parsePairRelocation(DefinedAtom &AtomToFix, Edge::Kind SubtractorKind, + parsePairRelocation(Block &BlockToFix, Edge::Kind SubtractorKind, const MachO::relocation_info &SubRI, JITTargetAddress FixupAddress, const char *FixupContent, object::relocation_iterator &UnsignedRelItr, @@ -154,9 +148,11 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { return make_error("length of x86_64 SUBTRACTOR and paired " "UNSIGNED reloc must match"); - auto FromAtom = findAtomBySymbolIndex(SubRI); - if (!FromAtom) - return FromAtom.takeError(); + Symbol *FromSymbol; + if (auto FromSymbolOrErr = findSymbolByIndex(SubRI.r_symbolnum)) + FromSymbol = FromSymbolOrErr->GraphSymbol; + else + return FromSymbolOrErr.takeError(); // Read the current fixup value. uint64_t FixupValue = 0; @@ -165,54 +161,60 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { else FixupValue = *(const little32_t *)FixupContent; - // Find 'ToAtom' using symbol number or address, depending on whether the + // Find 'ToSymbol' using symbol number or address, depending on whether the // paired UNSIGNED relocation is extern. - Atom *ToAtom = nullptr; + Symbol *ToSymbol = nullptr; if (UnsignedRI.r_extern) { - // Find target atom by symbol index. - if (auto ToAtomOrErr = findAtomBySymbolIndex(UnsignedRI)) - ToAtom = &*ToAtomOrErr; + // Find target symbol by symbol index. + if (auto ToSymbolOrErr = findSymbolByIndex(UnsignedRI.r_symbolnum)) + ToSymbol = ToSymbolOrErr->GraphSymbol; else - return ToAtomOrErr.takeError(); + return ToSymbolOrErr.takeError(); } else { - if (auto ToAtomOrErr = getGraph().findAtomByAddress(FixupValue)) - ToAtom = &*ToAtomOrErr; + if (auto ToSymbolOrErr = findSymbolByAddress(FixupValue)) + ToSymbol = &*ToSymbolOrErr; else - return ToAtomOrErr.takeError(); - FixupValue -= ToAtom->getAddress(); + return ToSymbolOrErr.takeError(); + FixupValue -= ToSymbol->getAddress(); } MachOX86RelocationKind DeltaKind; - Atom *TargetAtom; + Symbol *TargetSymbol; uint64_t Addend; - if (areLayoutLocked(AtomToFix, *FromAtom)) { - TargetAtom = ToAtom; + if (&BlockToFix == &FromSymbol->getAddressable()) { + TargetSymbol = ToSymbol; DeltaKind = (SubRI.r_length == 3) ? Delta64 : Delta32; - Addend = FixupValue + (FixupAddress - FromAtom->getAddress()); + Addend = FixupValue + (FixupAddress - FromSymbol->getAddress()); // FIXME: handle extern 'from'. - } else if (areLayoutLocked(AtomToFix, *ToAtom)) { - TargetAtom = &*FromAtom; + } else if (&BlockToFix == &ToSymbol->getAddressable()) { + TargetSymbol = FromSymbol; DeltaKind = (SubRI.r_length == 3) ? NegDelta64 : NegDelta32; - Addend = FixupValue - (FixupAddress - ToAtom->getAddress()); + Addend = FixupValue - (FixupAddress - ToSymbol->getAddress()); } else { - // AtomToFix was neither FromAtom nor ToAtom. + // BlockToFix was neither FromSymbol nor ToSymbol. 
return make_error("SUBTRACTOR relocation must fix up " - "either 'A' or 'B' (or an atom in one " - "of their alt-entry groups)"); + "either 'A' or 'B' (or a symbol in one " + "of their alt-entry chains)"); } - return PairRelocInfo(DeltaKind, TargetAtom, Addend); + return PairRelocInfo(DeltaKind, TargetSymbol, Addend); } Error addRelocations() override { using namespace support; - auto &G = getGraph(); auto &Obj = getObject(); for (auto &S : Obj.sections()) { JITTargetAddress SectionAddress = S.getAddress(); + if (S.isVirtual()) { + if (S.relocation_begin() != S.relocation_end()) + return make_error("Virtual section contains " + "relocations"); + continue; + } + for (auto RelItr = S.relocation_begin(), RelEnd = S.relocation_end(); RelItr != RelEnd; ++RelItr) { @@ -231,26 +233,26 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { << format("0x%016" PRIx64, FixupAddress) << "\n"; }); - // Find the atom that the fixup points to. - DefinedAtom *AtomToFix = nullptr; + // Find the block that the fixup points to. + Block *BlockToFix = nullptr; { - auto AtomToFixOrErr = G.findAtomByAddress(FixupAddress); - if (!AtomToFixOrErr) - return AtomToFixOrErr.takeError(); - AtomToFix = &*AtomToFixOrErr; + auto SymbolToFixOrErr = findSymbolByAddress(FixupAddress); + if (!SymbolToFixOrErr) + return SymbolToFixOrErr.takeError(); + BlockToFix = &SymbolToFixOrErr->getBlock(); } if (FixupAddress + static_cast(1ULL << RI.r_length) > - AtomToFix->getAddress() + AtomToFix->getContent().size()) + BlockToFix->getAddress() + BlockToFix->getContent().size()) return make_error( - "Relocation content extends past end of fixup atom"); + "Relocation extends past end of fixup block"); // Get a pointer to the fixup content. - const char *FixupContent = AtomToFix->getContent().data() + - (FixupAddress - AtomToFix->getAddress()); + const char *FixupContent = BlockToFix->getContent().data() + + (FixupAddress - BlockToFix->getAddress()); - // The target atom and addend will be populated by the switch below. - Atom *TargetAtom = nullptr; + // The target symbol and addend will be populated by the switch below. 
+ Symbol *TargetSymbol = nullptr; uint64_t Addend = 0; switch (*Kind) { @@ -258,53 +260,53 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { case PCRel32: case PCRel32GOTLoad: case PCRel32GOT: - if (auto TargetAtomOrErr = findAtomBySymbolIndex(RI)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; else - return TargetAtomOrErr.takeError(); + return TargetSymbolOrErr.takeError(); Addend = *(const ulittle32_t *)FixupContent; break; case Pointer32: - if (auto TargetAtomOrErr = findAtomBySymbolIndex(RI)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; else - return TargetAtomOrErr.takeError(); + return TargetSymbolOrErr.takeError(); Addend = *(const ulittle32_t *)FixupContent; break; case Pointer64: - if (auto TargetAtomOrErr = findAtomBySymbolIndex(RI)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; else - return TargetAtomOrErr.takeError(); + return TargetSymbolOrErr.takeError(); Addend = *(const ulittle64_t *)FixupContent; break; case Pointer64Anon: { JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent; - if (auto TargetAtomOrErr = G.findAtomByAddress(TargetAddress)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) + TargetSymbol = &*TargetSymbolOrErr; else - return TargetAtomOrErr.takeError(); - Addend = TargetAddress - TargetAtom->getAddress(); + return TargetSymbolOrErr.takeError(); + Addend = TargetAddress - TargetSymbol->getAddress(); break; } case PCRel32Minus1: case PCRel32Minus2: case PCRel32Minus4: - if (auto TargetAtomOrErr = findAtomBySymbolIndex(RI)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) + TargetSymbol = TargetSymbolOrErr->GraphSymbol; else - return TargetAtomOrErr.takeError(); + return TargetSymbolOrErr.takeError(); Addend = *(const ulittle32_t *)FixupContent + (1 << (*Kind - PCRel32Minus1)); break; case PCRel32Anon: { JITTargetAddress TargetAddress = FixupAddress + 4 + *(const ulittle32_t *)FixupContent; - if (auto TargetAtomOrErr = G.findAtomByAddress(TargetAddress)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) + TargetSymbol = &*TargetSymbolOrErr; else - return TargetAtomOrErr.takeError(); - Addend = TargetAddress - TargetAtom->getAddress(); + return TargetSymbolOrErr.takeError(); + Addend = TargetAddress - TargetSymbol->getAddress(); break; } case PCRel32Minus1Anon: @@ -314,11 +316,11 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { static_cast(1ULL << (*Kind - PCRel32Minus1Anon)); JITTargetAddress TargetAddress = FixupAddress + 4 + Delta + *(const ulittle32_t *)FixupContent; - if (auto TargetAtomOrErr = G.findAtomByAddress(TargetAddress)) - TargetAtom = &*TargetAtomOrErr; + if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) + TargetSymbol = &*TargetSymbolOrErr; else - return TargetAtomOrErr.takeError(); - Addend = TargetAddress - TargetAtom->getAddress(); + return TargetSymbolOrErr.takeError(); + Addend = TargetAddress - TargetSymbol->getAddress(); break; } case Delta32: @@ -329,12 +331,12 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { // NegDelta32/NegDelta64, depending on the direction of the // subtraction) 
along with the addend. auto PairInfo = - parsePairRelocation(*AtomToFix, *Kind, RI, FixupAddress, + parsePairRelocation(*BlockToFix, *Kind, RI, FixupAddress, FixupContent, ++RelItr, RelEnd); if (!PairInfo) return PairInfo.takeError(); - std::tie(*Kind, TargetAtom, Addend) = *PairInfo; - assert(TargetAtom && "No target atom from parsePairRelocation?"); + std::tie(*Kind, TargetSymbol, Addend) = *PairInfo; + assert(TargetSymbol && "No target symbol from parsePairRelocation?"); break; } default: @@ -343,41 +345,38 @@ class MachOAtomGraphBuilder_x86_64 : public MachOAtomGraphBuilder { } LLVM_DEBUG({ - Edge GE(*Kind, FixupAddress - AtomToFix->getAddress(), *TargetAtom, + Edge GE(*Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol, Addend); - printEdge(dbgs(), *AtomToFix, GE, + printEdge(dbgs(), *BlockToFix, GE, getMachOX86RelocationKindName(*Kind)); dbgs() << "\n"; }); - AtomToFix->addEdge(*Kind, FixupAddress - AtomToFix->getAddress(), - *TargetAtom, Addend); + BlockToFix->addEdge(*Kind, FixupAddress - BlockToFix->getAddress(), + *TargetSymbol, Addend); } } return Error::success(); } - - unsigned NumSymbols = 0; }; class MachO_x86_64_GOTAndStubsBuilder : public BasicGOTAndStubsBuilder { public: - MachO_x86_64_GOTAndStubsBuilder(AtomGraph &G) + MachO_x86_64_GOTAndStubsBuilder(LinkGraph &G) : BasicGOTAndStubsBuilder(G) {} bool isGOTEdge(Edge &E) const { return E.getKind() == PCRel32GOT || E.getKind() == PCRel32GOTLoad; } - DefinedAtom &createGOTEntry(Atom &Target) { - auto &GOTEntryAtom = G.addAnonymousAtom(getGOTSection(), 0x0, 8); - GOTEntryAtom.setContent( - StringRef(reinterpret_cast(NullGOTEntryContent), 8)); - GOTEntryAtom.addEdge(Pointer64, 0, Target, 0); - return GOTEntryAtom; + Symbol &createGOTEntry(Symbol &Target) { + auto &GOTEntryBlock = G.createContentBlock( + getGOTSection(), getGOTEntryBlockContent(), 0, 8, 0); + GOTEntryBlock.addEdge(Pointer64, 0, Target, 0); + return G.addAnonymousSymbol(GOTEntryBlock, 0, 8, false, false); } - void fixGOTEdge(Edge &E, Atom &GOTEntry) { + void fixGOTEdge(Edge &E, Symbol &GOTEntry) { assert((E.getKind() == PCRel32GOT || E.getKind() == PCRel32GOTLoad) && "Not a GOT edge?"); E.setKind(PCRel32); @@ -389,19 +388,16 @@ class MachO_x86_64_GOTAndStubsBuilder return E.getKind() == Branch32 && !E.getTarget().isDefined(); } - DefinedAtom &createStub(Atom &Target) { - auto &StubAtom = G.addAnonymousAtom(getStubsSection(), 0x0, 2); - StubAtom.setContent( - StringRef(reinterpret_cast(StubContent), 6)); - + Symbol &createStub(Symbol &Target) { + auto &StubContentBlock = + G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 1, 0); // Re-use GOT entries for stub targets. 
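createGOTEntry above, and createStub continuing just below, build the GOT and stub bodies directly as content blocks: a GOT entry is eight zero bytes with a Pointer64 edge at offset 0 pointing at the real target, and a stub is a six-byte RIP-relative jump whose 32-bit displacement field (offset 2) carries a PCRel32 edge to the GOT entry, so external Branch32 edges can simply be retargeted at the stub. The actual NullGOTEntryContent/StubContent byte arrays are defined outside this hunk; the sketch below assumes the conventional encoding:

#include <array>
#include <cstdint>

// Assumed x86-64 encodings; the real arrays live outside this hunk.
constexpr std::array<uint8_t, 8> GOTEntry = {0, 0, 0, 0, 0, 0, 0, 0};
constexpr std::array<uint8_t, 6> Stub = {0xFF, 0x25, 0, 0, 0, 0}; // jmpq *disp32(%rip)

// Edge layout used above:
//   GOT entry: Pointer64 edge at offset 0 -> target, later fixed up with
//              the target's absolute address.
//   Stub:      PCRel32 edge at offset 2 -> GOT entry (the disp32 field of
//              the jmp), so a call to an external goes stub -> GOT -> target.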
- auto &GOTEntryAtom = getGOTEntryAtom(Target); - StubAtom.addEdge(PCRel32, 2, GOTEntryAtom, 0); - - return StubAtom; + auto &GOTEntrySymbol = getGOTEntrySymbol(Target); + StubContentBlock.addEdge(PCRel32, 2, GOTEntrySymbol, 0); + return G.addAnonymousSymbol(StubContentBlock, 0, 6, true, false); } - void fixExternalBranchEdge(Edge &E, Atom &Stub) { + void fixExternalBranchEdge(Edge &E, Symbol &Stub) { assert(E.getKind() == Branch32 && "Not a Branch32 edge?"); assert(E.getAddend() == 0 && "Branch32 edge has non-zero addend?"); E.setTarget(Stub); @@ -410,7 +406,7 @@ class MachO_x86_64_GOTAndStubsBuilder private: Section &getGOTSection() { if (!GOTSection) - GOTSection = &G.createSection("$__GOT", 8, sys::Memory::MF_READ, false); + GOTSection = &G.createSection("$__GOT", sys::Memory::MF_READ); return *GOTSection; } @@ -418,11 +414,21 @@ class MachO_x86_64_GOTAndStubsBuilder if (!StubsSection) { auto StubsProt = static_cast( sys::Memory::MF_READ | sys::Memory::MF_EXEC); - StubsSection = &G.createSection("$__STUBS", 8, StubsProt, false); + StubsSection = &G.createSection("$__STUBS", StubsProt); } return *StubsSection; } + StringRef getGOTEntryBlockContent() { + return StringRef(reinterpret_cast(NullGOTEntryContent), + sizeof(NullGOTEntryContent)); + } + + StringRef getStubBlockContent() { + return StringRef(reinterpret_cast(StubContent), + sizeof(StubContent)); + } + static const uint8_t NullGOTEntryContent[8]; static const uint8_t StubContent[6]; Section *GOTSection = nullptr; @@ -451,30 +457,31 @@ class MachOJITLinker_x86_64 : public JITLinker { return getMachOX86RelocationKindName(R); } - Expected> + Expected> buildGraph(MemoryBufferRef ObjBuffer) override { auto MachOObj = object::ObjectFile::createMachOObjectFile(ObjBuffer); if (!MachOObj) return MachOObj.takeError(); - return MachOAtomGraphBuilder_x86_64(**MachOObj).buildGraph(); + return MachOLinkGraphBuilder_x86_64(**MachOObj).buildGraph(); } - static Error targetOutOfRangeError(const Atom &A, const Edge &E) { + static Error targetOutOfRangeError(const Block &B, const Edge &E) { std::string ErrMsg; { raw_string_ostream ErrStream(ErrMsg); ErrStream << "Relocation target out of range: "; - printEdge(ErrStream, A, E, getMachOX86RelocationKindName(E.getKind())); + printEdge(ErrStream, B, E, getMachOX86RelocationKindName(E.getKind())); ErrStream << "\n"; } return make_error(std::move(ErrMsg)); } - Error applyFixup(DefinedAtom &A, const Edge &E, char *AtomWorkingMem) const { + Error applyFixup(Block &B, const Edge &E, char *BlockWorkingMem) const { + using namespace support; - char *FixupPtr = AtomWorkingMem + E.getOffset(); - JITTargetAddress FixupAddress = A.getAddress() + E.getOffset(); + char *FixupPtr = BlockWorkingMem + E.getOffset(); + JITTargetAddress FixupAddress = B.getAddress() + E.getOffset(); switch (E.getKind()) { case Branch32: @@ -484,7 +491,7 @@ class MachOJITLinker_x86_64 : public JITLinker { E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); if (Value < std::numeric_limits::min() || Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return targetOutOfRangeError(B, E); *(little32_t *)FixupPtr = Value; break; } @@ -502,7 +509,7 @@ class MachOJITLinker_x86_64 : public JITLinker { E.getTarget().getAddress() - (FixupAddress + Delta) + E.getAddend(); if (Value < std::numeric_limits::min() || Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return targetOutOfRangeError(B, E); *(little32_t *)FixupPtr = Value; break; } @@ -514,7 +521,7 @@ class MachOJITLinker_x86_64 : 
public JITLinker { E.getTarget().getAddress() - (FixupAddress + Delta) + E.getAddend(); if (Value < std::numeric_limits::min() || Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return targetOutOfRangeError(B, E); *(little32_t *)FixupPtr = Value; break; } @@ -531,7 +538,7 @@ class MachOJITLinker_x86_64 : public JITLinker { if (E.getKind() == Delta32 || E.getKind() == NegDelta32) { if (Value < std::numeric_limits::min() || Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return targetOutOfRangeError(B, E); *(little32_t *)FixupPtr = Value; } else *(little64_t *)FixupPtr = Value; @@ -540,7 +547,7 @@ class MachOJITLinker_x86_64 : public JITLinker { case Pointer32: { uint64_t Value = E.getTarget().getAddress() + E.getAddend(); if (Value > std::numeric_limits::max()) - return targetOutOfRangeError(A, E); + return targetOutOfRangeError(B, E); *(ulittle32_t *)FixupPtr = Value; break; } @@ -563,10 +570,10 @@ void jitLink_MachO_x86_64(std::unique_ptr Ctx) { if (auto MarkLive = Ctx->getMarkLivePass(TT)) Config.PrePrunePasses.push_back(std::move(MarkLive)); else - Config.PrePrunePasses.push_back(markAllAtomsLive); + Config.PrePrunePasses.push_back(markAllSymbolsLive); // Add an in-place GOT/Stubs pass. - Config.PostPrunePasses.push_back([](AtomGraph &G) -> Error { + Config.PostPrunePasses.push_back([](LinkGraph &G) -> Error { MachO_x86_64_GOTAndStubsBuilder(G).run(); return Error::success(); }); diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index 436fd55cd8ddf0..5c7d888c2d6e19 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -226,7 +226,7 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases) { for (auto &KV : Aliases) OS << " " << *KV.first << ": " << KV.second.Aliasee << " " << KV.second.AliasFlags; - OS << " }\n"; + OS << " }"; return OS; } @@ -378,15 +378,12 @@ Error MaterializationResponsibility::notifyResolved(const SymbolMap &Symbols) { }); #ifndef NDEBUG for (auto &KV : Symbols) { + auto WeakFlags = JITSymbolFlags::Weak | JITSymbolFlags::Common; auto I = SymbolFlags.find(KV.first); assert(I != SymbolFlags.end() && "Resolving symbol outside this responsibility set"); - if (I->second.isWeak()) - assert(I->second == (KV.second.getFlags() | JITSymbolFlags::Weak) && - "Resolving symbol with incorrect flags"); - else - assert(I->second == KV.second.getFlags() && - "Resolving symbol with incorrect flags"); + assert((KV.second.getFlags() & ~WeakFlags) == (I->second & ~WeakFlags) && + "Resolving symbol with incorrect flags"); } #endif @@ -949,11 +946,14 @@ Error JITDylib::resolve(const SymbolMap &Resolved) { if (SymI->second.getFlags().hasError()) SymbolsInErrorState.insert(KV.first); else { - assert((KV.second.getFlags() & ~JITSymbolFlags::Weak) == - (SymI->second.getFlags() & ~JITSymbolFlags::Weak) && + auto Flags = KV.second.getFlags(); + Flags &= ~(JITSymbolFlags::Weak | JITSymbolFlags::Common); + assert(Flags == (SymI->second.getFlags() & + ~(JITSymbolFlags::Weak | JITSymbolFlags::Common)) && "Resolved flags should match the declared flags"); - Worklist.push_back({SymI, KV.second}); + Worklist.push_back( + {SymI, JITEvaluatedSymbol(KV.second.getAddress(), Flags)}); } } @@ -970,7 +970,6 @@ Error JITDylib::resolve(const SymbolMap &Resolved) { // Resolved symbols can not be weak: discard the weak flag. 
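The Orc Core.cpp changes above relax the resolved-versus-declared flag checks so that the Weak and Common bits may legitimately differ (a weak or common declaration can be resolved by a plain strong definition); every other flag bit must still match, and the weak/common bits are stripped before the symbol table entry is updated. The comparison reduces to a simple mask, sketched here with stand-in flag bits rather than the real JITSymbolFlags:

#include <cassert>
#include <cstdint>

// Stand-in flag bits; the real type is JITSymbolFlags.
enum Flags : uint32_t {
  Exported = 1 << 0,
  Weak     = 1 << 1,
  Common   = 1 << 2,
  Callable = 1 << 3,
};

bool flagsMatchIgnoringWeakCommon(uint32_t Declared, uint32_t Resolved) {
  constexpr uint32_t Ignore = Weak | Common;
  return (Declared & ~Ignore) == (Resolved & ~Ignore);
}

int main() {
  // A symbol declared weak+exported may resolve to a strong exported def.
  assert(flagsMatchIgnoringWeakCommon(Exported | Weak, Exported));
  // But any other mismatch is still an error.
  assert(!flagsMatchIgnoringWeakCommon(Exported, Exported | Callable));
  return 0;
}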
JITSymbolFlags ResolvedFlags = ResolvedSym.getFlags(); - ResolvedFlags &= ~JITSymbolFlags::Weak; SymI->second.setAddress(ResolvedSym.getAddress()); SymI->second.setFlags(ResolvedFlags); SymI->second.setState(SymbolState::Resolved); diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index c20d7d1d0faf03..4a886ac0597c12 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -88,7 +88,7 @@ iterator_range getDestructors(const Module &M) { } void CtorDtorRunner::add(iterator_range CtorDtors) { - if (empty(CtorDtors)) + if (CtorDtors.empty()) return; MangleAndInterner Mangle( diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index e1b8d52acb4296..952ca6071ffb8a 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -41,7 +41,7 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } void lookup(const DenseSet &Symbols, - JITLinkAsyncLookupContinuation LookupContinuation) override { + std::unique_ptr LC) override { JITDylibSearchList SearchOrder; MR.getTargetJITDylib().withSearchOrderDo( @@ -54,19 +54,16 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { InternedSymbols.insert(ES.intern(S)); // OnResolve -- De-intern the symbols and pass the result to the linker. - // FIXME: Capture LookupContinuation by move once we have c++14. - auto SharedLookupContinuation = - std::make_shared( - std::move(LookupContinuation)); - auto OnResolve = [this, SharedLookupContinuation](Expected Result) { + auto OnResolve = [this, LookupContinuation = std::move(LC)]( + Expected Result) mutable { auto Main = Layer.getExecutionSession().intern("_main"); if (!Result) - (*SharedLookupContinuation)(Result.takeError()); + LookupContinuation->run(Result.takeError()); else { AsyncLookupResult LR; for (auto &KV : *Result) LR[*KV.first] = KV.second; - (*SharedLookupContinuation)(std::move(LR)); + LookupContinuation->run(std::move(LR)); } }; @@ -76,29 +73,25 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { }); } - void notifyResolved(AtomGraph &G) override { + void notifyResolved(LinkGraph &G) override { auto &ES = Layer.getExecutionSession(); SymbolFlagsMap ExtraSymbolsToClaim; bool AutoClaim = Layer.AutoClaimObjectSymbols; SymbolMap InternedResult; - for (auto *DA : G.defined_atoms()) - if (DA->hasName() && DA->isGlobal()) { - auto InternedName = ES.intern(DA->getName()); + for (auto *Sym : G.defined_symbols()) + if (Sym->hasName() && Sym->getScope() != Scope::Local) { + auto InternedName = ES.intern(Sym->getName()); JITSymbolFlags Flags; - if (DA->isExported()) - Flags |= JITSymbolFlags::Exported; - if (DA->isWeak()) - Flags |= JITSymbolFlags::Weak; - if (DA->isCallable()) + if (Sym->isCallable()) Flags |= JITSymbolFlags::Callable; - if (DA->isCommon()) - Flags |= JITSymbolFlags::Common; + if (Sym->getScope() == Scope::Default) + Flags |= JITSymbolFlags::Exported; InternedResult[InternedName] = - JITEvaluatedSymbol(DA->getAddress(), Flags); + JITEvaluatedSymbol(Sym->getAddress(), Flags); if (AutoClaim && !MR.getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); @@ -106,17 +99,17 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } } - for (auto *A : G.absolute_atoms()) - if (A->hasName()) { - auto InternedName = 
ES.intern(A->getName()); + for (auto *Sym : G.absolute_symbols()) + if (Sym->hasName()) { + auto InternedName = ES.intern(Sym->getName()); JITSymbolFlags Flags; Flags |= JITSymbolFlags::Absolute; - if (A->isWeak()) - Flags |= JITSymbolFlags::Weak; - if (A->isCallable()) + if (Sym->isCallable()) Flags |= JITSymbolFlags::Callable; + if (Sym->getLinkage() == Linkage::Weak) + Flags |= JITSymbolFlags::Weak; InternedResult[InternedName] = - JITEvaluatedSymbol(A->getAddress(), Flags); + JITEvaluatedSymbol(Sym->getAddress(), Flags); if (AutoClaim && !MR.getSymbols().count(InternedName)) { assert(!ExtraSymbolsToClaim.count(InternedName) && "Duplicate symbol to claim?"); @@ -148,17 +141,17 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } } - AtomGraphPassFunction getMarkLivePass(const Triple &TT) const override { - return [this](AtomGraph &G) { return markResponsibilitySymbolsLive(G); }; + LinkGraphPassFunction getMarkLivePass(const Triple &TT) const override { + return [this](LinkGraph &G) { return markResponsibilitySymbolsLive(G); }; } Error modifyPassConfig(const Triple &TT, PassConfiguration &Config) override { // Add passes to mark duplicate defs as should-discard, and to walk the - // atom graph to build the symbol dependence graph. + // link graph to build the symbol dependence graph. Config.PrePrunePasses.push_back( - [this](AtomGraph &G) { return markSymbolsToDiscard(G); }); + [this](LinkGraph &G) { return externalizeWeakAndCommonSymbols(G); }); Config.PostPrunePasses.push_back( - [this](AtomGraph &G) { return computeNamedSymbolDependencies(G); }); + [this](LinkGraph &G) { return computeNamedSymbolDependencies(G); }); Layer.modifyPassConfig(MR, TT, Config); @@ -166,65 +159,59 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } private: - using AnonAtomNamedDependenciesMap = - DenseMap; + using AnonToNamedDependenciesMap = DenseMap; - Error markSymbolsToDiscard(AtomGraph &G) { + Error externalizeWeakAndCommonSymbols(LinkGraph &G) { auto &ES = Layer.getExecutionSession(); - for (auto *DA : G.defined_atoms()) - if (DA->isWeak() && DA->hasName()) { - auto S = ES.intern(DA->getName()); - auto I = MR.getSymbols().find(S); - if (I == MR.getSymbols().end()) - DA->setShouldDiscard(true); + for (auto *Sym : G.defined_symbols()) + if (Sym->hasName() && Sym->getLinkage() == Linkage::Weak) { + if (!MR.getSymbols().count(ES.intern(Sym->getName()))) + G.makeExternal(*Sym); } - for (auto *A : G.absolute_atoms()) - if (A->isWeak() && A->hasName()) { - auto S = ES.intern(A->getName()); - auto I = MR.getSymbols().find(S); - if (I == MR.getSymbols().end()) - A->setShouldDiscard(true); + for (auto *Sym : G.absolute_symbols()) + if (Sym->hasName() && Sym->getLinkage() == Linkage::Weak) { + if (!MR.getSymbols().count(ES.intern(Sym->getName()))) + G.makeExternal(*Sym); } return Error::success(); } - Error markResponsibilitySymbolsLive(AtomGraph &G) const { + Error markResponsibilitySymbolsLive(LinkGraph &G) const { auto &ES = Layer.getExecutionSession(); - for (auto *DA : G.defined_atoms()) - if (DA->hasName() && - MR.getSymbols().count(ES.intern(DA->getName()))) - DA->setLive(true); + for (auto *Sym : G.defined_symbols()) + if (Sym->hasName() && MR.getSymbols().count(ES.intern(Sym->getName()))) + Sym->setLive(true); return Error::success(); } - Error computeNamedSymbolDependencies(AtomGraph &G) { + Error computeNamedSymbolDependencies(LinkGraph &G) { auto &ES = MR.getTargetJITDylib().getExecutionSession(); auto AnonDeps = computeAnonDeps(G); - for (auto *DA : 
G.defined_atoms()) { + for (auto *Sym : G.defined_symbols()) { // Skip anonymous and non-global atoms: we do not need dependencies for // these. - if (!DA->hasName() || !DA->isGlobal()) + if (Sym->getScope() == Scope::Local) continue; - auto DAName = ES.intern(DA->getName()); - SymbolNameSet &DADeps = NamedSymbolDeps[DAName]; + auto SymName = ES.intern(Sym->getName()); + SymbolNameSet &SymDeps = NamedSymbolDeps[SymName]; - for (auto &E : DA->edges()) { - auto &TA = E.getTarget(); + for (auto &E : Sym->getBlock().edges()) { + auto &TargetSym = E.getTarget(); - if (TA.hasName()) - DADeps.insert(ES.intern(TA.getName())); + if (TargetSym.getScope() != Scope::Local) + SymDeps.insert(ES.intern(TargetSym.getName())); else { - assert(TA.isDefined() && "Anonymous atoms must be defined"); - auto &DTA = static_cast(TA); - auto I = AnonDeps.find(&DTA); + assert(TargetSym.isDefined() && + "Anonymous/local symbols must be defined"); + auto I = AnonDeps.find(&TargetSym); if (I != AnonDeps.end()) for (auto &S : I->second) - DADeps.insert(S); + SymDeps.insert(S); } } } @@ -232,58 +219,59 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { return Error::success(); } - AnonAtomNamedDependenciesMap computeAnonDeps(AtomGraph &G) { + AnonToNamedDependenciesMap computeAnonDeps(LinkGraph &G) { auto &ES = MR.getTargetJITDylib().getExecutionSession(); - AnonAtomNamedDependenciesMap DepMap; + AnonToNamedDependenciesMap DepMap; - // For all anonymous atoms: + // For all anonymous symbols: // (1) Add their named dependencies. // (2) Add them to the worklist for further iteration if they have any - // depend on any other anonymous atoms. + // depend on any other anonymous symbols. struct WorklistEntry { - WorklistEntry(DefinedAtom *DA, DenseSet DAAnonDeps) - : DA(DA), DAAnonDeps(std::move(DAAnonDeps)) {} + WorklistEntry(Symbol *Sym, DenseSet SymAnonDeps) + : Sym(Sym), SymAnonDeps(std::move(SymAnonDeps)) {} - DefinedAtom *DA = nullptr; - DenseSet DAAnonDeps; + Symbol *Sym = nullptr; + DenseSet SymAnonDeps; }; std::vector Worklist; - for (auto *DA : G.defined_atoms()) - if (!DA->hasName()) { - auto &DANamedDeps = DepMap[DA]; - DenseSet DAAnonDeps; - - for (auto &E : DA->edges()) { - auto &TA = E.getTarget(); - if (TA.hasName()) - DANamedDeps.insert(ES.intern(TA.getName())); + for (auto *Sym : G.defined_symbols()) + if (!Sym->hasName()) { + auto &SymNamedDeps = DepMap[Sym]; + DenseSet SymAnonDeps; + + for (auto &E : Sym->getBlock().edges()) { + auto &TargetSym = E.getTarget(); + if (TargetSym.hasName()) + SymNamedDeps.insert(ES.intern(TargetSym.getName())); else { - assert(TA.isDefined() && "Anonymous atoms must be defined"); - DAAnonDeps.insert(static_cast(&TA)); + assert(TargetSym.isDefined() && + "Anonymous symbols must be defined"); + SymAnonDeps.insert(&TargetSym); } } - if (!DAAnonDeps.empty()) - Worklist.push_back(WorklistEntry(DA, std::move(DAAnonDeps))); + if (!SymAnonDeps.empty()) + Worklist.push_back(WorklistEntry(Sym, std::move(SymAnonDeps))); } - // Loop over all anonymous atoms with anonymous dependencies, propagating + // Loop over all anonymous symbols with anonymous dependencies, propagating // their respective *named* dependencies. Iterate until we hit a stable // state. 
bool Changed; do { Changed = false; for (auto &WLEntry : Worklist) { - auto *DA = WLEntry.DA; - auto &DANamedDeps = DepMap[DA]; - auto &DAAnonDeps = WLEntry.DAAnonDeps; + auto *Sym = WLEntry.Sym; + auto &SymNamedDeps = DepMap[Sym]; + auto &SymAnonDeps = WLEntry.SymAnonDeps; - for (auto *TA : DAAnonDeps) { - auto I = DepMap.find(TA); + for (auto *TargetSym : SymAnonDeps) { + auto I = DepMap.find(TargetSym); if (I != DepMap.end()) for (const auto &S : I->second) - Changed |= DANamedDeps.insert(S).second; + Changed |= SymNamedDeps.insert(S).second; } } } while (Changed); @@ -414,7 +402,7 @@ Error ObjectLinkingLayer::removeAllModules() { } EHFrameRegistrationPlugin::EHFrameRegistrationPlugin( - jitlink::EHFrameRegistrar &Registrar) + EHFrameRegistrar &Registrar) : Registrar(Registrar) {} void EHFrameRegistrationPlugin::modifyPassConfig( diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index a599a08998767b..c548c56211ae82 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2014,7 +2014,7 @@ void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) { else if (LoadInst *LI = dyn_cast(P)) LI->setAlignment(MaybeAlign(Bytes)); else if (StoreInst *SI = dyn_cast(P)) - SI->setAlignment(Bytes); + SI->setAlignment(MaybeAlign(Bytes)); else llvm_unreachable( "only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment"); diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index ce47ef2074343d..1bbe6b85d26008 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -279,7 +279,7 @@ bool DebugInfoFinder::addScope(DIScope *Scope) { } static MDNode *stripDebugLocFromLoopID(MDNode *N) { - assert(!empty(N->operands()) && "Missing self reference?"); + assert(!N->operands().empty() && "Missing self reference?"); // if there is no debug location, we do not have to rewrite this MDNode. if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) { @@ -929,6 +929,26 @@ const char *LLVMDIFileGetSource(LLVMMetadataRef File, unsigned *Len) { return ""; } +LLVMMetadataRef LLVMDIBuilderCreateMacro(LLVMDIBuilderRef Builder, + LLVMMetadataRef ParentMacroFile, + unsigned Line, + LLVMDWARFMacinfoRecordType RecordType, + const char *Name, size_t NameLen, + const char *Value, size_t ValueLen) { + return wrap( + unwrap(Builder)->createMacro(unwrapDI(ParentMacroFile), Line, + static_cast(RecordType), + {Name, NameLen}, {Value, ValueLen})); +} + +LLVMMetadataRef +LLVMDIBuilderCreateTempMacroFile(LLVMDIBuilderRef Builder, + LLVMMetadataRef ParentMacroFile, unsigned Line, + LLVMMetadataRef File) { + return wrap(unwrap(Builder)->createTempMacroFile( + unwrapDI(ParentMacroFile), Line, unwrapDI(File))); +} + LLVMMetadataRef LLVMDIBuilderCreateEnumerator(LLVMDIBuilderRef Builder, const char *Name, size_t NameLen, int64_t Value, diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 5c22109ffd5bc5..3f70d2c904e5ce 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -706,7 +706,8 @@ enum IIT_Info { IIT_VEC_ELEMENT = 42, IIT_SCALABLE_VEC = 43, IIT_SUBDIVIDE2_ARG = 44, - IIT_SUBDIVIDE4_ARG = 45 + IIT_SUBDIVIDE4_ARG = 45, + IIT_VEC_OF_BITCASTS_TO_INT = 46 }; static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, @@ -895,6 +896,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, DecodeIITType(NextElt, Infos, OutputTable); return; } + case IIT_VEC_OF_BITCASTS_TO_INT: { + unsigned ArgInfo = (NextElt == Infos.size() ? 
0 : Infos[NextElt++]); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfBitcastsToInt, + ArgInfo)); + return; + } } llvm_unreachable("unhandled"); } @@ -1021,6 +1028,12 @@ static Type *DecodeFixedType(ArrayRef &Infos, return VTy->getElementType(); llvm_unreachable("Expected an argument of Vector Type"); } + case IITDescriptor::VecOfBitcastsToInt: { + Type *Ty = Tys[D.getArgumentNumber()]; + VectorType *VTy = dyn_cast(Ty); + assert(VTy && "Expected an argument of Vector Type"); + return VectorType::getInteger(VTy); + } case IITDescriptor::VecOfAnyPtrsToElt: // Return the overloaded type (which determines the pointers address space) return Tys[D.getOverloadArgNumber()]; @@ -1314,6 +1327,15 @@ static bool matchIntrinsicType( return matchIntrinsicType(VTy, Infos, ArgTys, DeferredChecks, IsDeferredCheck); } + case IITDescriptor::VecOfBitcastsToInt: { + if (D.getArgumentNumber() >= ArgTys.size()) + return IsDeferredCheck || DeferCheck(Ty); + auto *ReferenceType = dyn_cast(ArgTys[D.getArgumentNumber()]); + auto *ThisArgVecTy = dyn_cast(Ty); + if (!ThisArgVecTy || !ReferenceType) + return true; + return ThisArgVecTy != VectorType::getInteger(ReferenceType); + } } llvm_unreachable("unhandled"); } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 0f000623bdb51e..de1317ea9d3feb 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1397,7 +1397,7 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); - setAlignment(Align); + setAlignment(MaybeAlign(Align)); setAtomic(Order, SSID); AssertOK(); } @@ -1413,15 +1413,11 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); - setAlignment(Align); + setAlignment(MaybeAlign(Align)); setAtomic(Order, SSID); AssertOK(); } -void StoreInst::setAlignment(unsigned Align) { - setAlignment(llvm::MaybeAlign(Align)); -} - void StoreInst::setAlignment(MaybeAlign Align) { assert((!Align || *Align <= MaximumAlignment) && "Alignment is greater than MaximumAlignment!"); diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index d3c948d6de3850..26ed46a9cd9187 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -200,10 +200,14 @@ bool ConstrainedFPIntrinsic::isUnaryOp() const { case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_ceil: case Intrinsic::experimental_constrained_floor: + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: case Intrinsic::experimental_constrained_round: case Intrinsic::experimental_constrained_trunc: return true; diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 65b98d382cc475..c44d4b4f2bcc00 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -455,6 +455,7 @@ namespace { // Various metrics for how much to strip off of pointers. 
enum PointerStripKind { PSK_ZeroIndices, + PSK_ZeroIndicesAndAliases, PSK_ZeroIndicesSameRepresentation, PSK_ZeroIndicesAndInvariantGroups, PSK_InBoundsConstantIndices, @@ -475,6 +476,7 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) { if (auto *GEP = dyn_cast(V)) { switch (StripKind) { case PSK_ZeroIndices: + case PSK_ZeroIndicesAndAliases: case PSK_ZeroIndicesSameRepresentation: case PSK_ZeroIndicesAndInvariantGroups: if (!GEP->hasAllZeroIndices()) @@ -497,6 +499,8 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) { // TODO: If we know an address space cast will not change the // representation we could look through it here as well. V = cast(V)->getOperand(0); + } else if (StripKind == PSK_ZeroIndicesAndAliases && isa(V)) { + V = cast(V)->getAliasee(); } else { if (const auto *Call = dyn_cast(V)) { if (const Value *RV = Call->getReturnedArgOperand()) { @@ -526,6 +530,10 @@ const Value *Value::stripPointerCasts() const { return stripPointerCastsAndOffsets(this); } +const Value *Value::stripPointerCastsAndAliases() const { + return stripPointerCastsAndOffsets(this); +} + const Value *Value::stripPointerCastsSameRepresentation() const { return stripPointerCastsAndOffsets(this); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 2d274de51766a8..739b1c23e72bd8 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4311,12 +4311,16 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { case Intrinsic::experimental_constrained_log: case Intrinsic::experimental_constrained_log10: case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: case Intrinsic::experimental_constrained_maxnum: case Intrinsic::experimental_constrained_minnum: case Intrinsic::experimental_constrained_ceil: case Intrinsic::experimental_constrained_floor: + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: case Intrinsic::experimental_constrained_round: case Intrinsic::experimental_constrained_trunc: visitConstrainedFPIntrinsic(cast(Call)); @@ -4769,6 +4773,31 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { HasRoundingMD = true; break; + case Intrinsic::experimental_constrained_lrint: + case Intrinsic::experimental_constrained_llrint: { + Assert((NumOperands == 3), "invalid arguments for constrained FP intrinsic", + &FPI); + Type *ValTy = FPI.getArgOperand(0)->getType(); + Type *ResultTy = FPI.getType(); + Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), + "Intrinsic does not support vectors", &FPI); + HasExceptionMD = true; + HasRoundingMD = true; + } + break; + + case Intrinsic::experimental_constrained_lround: + case Intrinsic::experimental_constrained_llround: { + Assert((NumOperands == 2), "invalid arguments for constrained FP intrinsic", + &FPI); + Type *ValTy = FPI.getArgOperand(0)->getType(); + Type *ResultTy = FPI.getType(); + Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(), + "Intrinsic does not support vectors", &FPI); + HasExceptionMD = true; + break; + } + case Intrinsic::experimental_constrained_fma: Assert((NumOperands == 5), "invalid arguments for constrained FP intrinsic", &FPI); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 9e16cba44fde3b..6c5858b942a3f0 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ 
-1304,11 +1304,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, ImportLists, ExportLists); - // Update local devirtualized targets that were exported by cross-module - // importing - updateIndexWPDForExports(ThinLTO.CombinedIndex, ExportLists, - LocalWPDTargetsMap); - // Figure out which symbols need to be internalized. This also needs to happen // at -O0 because summary-based DCE is implemented using internalization, and // we must apply DCE consistently with the full LTO module in order to avoid @@ -1338,6 +1333,12 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, ExportList->second.count(GUID)) || ExportedGUIDs.count(GUID); }; + + // Update local devirtualized targets that were exported by cross-module + // importing or by other devirtualizations marked in the ExportedGUIDs set. + updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported, + LocalWPDTargetsMap); + auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) { return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index 1651d60565052e..f9c31f335f112a 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -174,6 +174,10 @@ template class ELFState { void writeSectionContent(Elf_Shdr &SHeader, const ELFYAML::HashSection &Section, ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA); + ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH); public: @@ -423,6 +427,8 @@ void ELFState::initSectionHeaders(std::vector &SHeaders, writeSectionContent(SHeader, *S, CBA); } else if (auto S = dyn_cast(Sec)) { writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); } else { llvm_unreachable("Unknown section type"); } @@ -824,8 +830,8 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, if (Section.Link.empty() && SN2I.lookup(".dynsym", Link)) SHeader.sh_link = Link; - if (Section.Content) { - SHeader.sh_size = writeContent(OS, Section.Content, None); + if (Section.Content || Section.Size) { + SHeader.sh_size = writeContent(OS, Section.Content, Section.Size); return; } @@ -990,6 +996,30 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, Section.Content->writeAsBinary(OS); } +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + unsigned Link = 0; + if (Section.Link.empty() && SN2I.lookup(".symtab", Link)) + SHeader.sh_link = Link; + + if (Section.Content || Section.Size) { + SHeader.sh_size = writeContent(OS, Section.Content, Section.Size); + return; + } + + for (const ELFYAML::AddrsigSymbol &Sym : *Section.Symbols) { + uint64_t Val = + Sym.Name ? 
toSymbolIndex(*Sym.Name, Section.Name, /*IsDynamic=*/false) + : (uint32_t)*Sym.Index; + SHeader.sh_size += encodeULEB128(Val, OS); + } +} + template void ELFState::buildSectionIndex() { for (unsigned I = 0, E = Doc.Sections.size(); I != E; ++I) { StringRef Name = Doc.Sections[I]->Name; diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 3a09a11ad8398c..29585abe6e80bb 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -1029,6 +1029,7 @@ static void sectionMapping(IO &IO, ELFYAML::HashSection &Section) { IO.mapOptional("Content", Section.Content); IO.mapOptional("Bucket", Section.Bucket); IO.mapOptional("Chain", Section.Chain); + IO.mapOptional("Size", Section.Size); } static void sectionMapping(IO &IO, ELFYAML::NoBitsSection &Section) { @@ -1070,6 +1071,13 @@ static void sectionMapping(IO &IO, ELFYAML::SymtabShndxSection &Section) { IO.mapRequired("Entries", Section.Entries); } +static void sectionMapping(IO &IO, ELFYAML::AddrsigSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Content", Section.Content); + IO.mapOptional("Size", Section.Size); + IO.mapOptional("Symbols", Section.Symbols); +} + void MappingTraits::mapping( IO &IO, ELFYAML::SectionOrType §ionOrType) { IO.mapRequired("SectionOrType", sectionOrType.sectionNameOrType); @@ -1160,6 +1168,11 @@ void MappingTraits>::mapping( Section.reset(new ELFYAML::SymtabShndxSection()); sectionMapping(IO, *cast(Section.get())); break; + case ELF::SHT_LLVM_ADDRSIG: + if (!IO.outputting()) + Section.reset(new ELFYAML::AddrsigSection()); + sectionMapping(IO, *cast(Section.get())); + break; default: if (!IO.outputting()) { StringRef Name; @@ -1210,14 +1223,20 @@ StringRef MappingTraits>::validate( } if (const auto *HS = dyn_cast(Section.get())) { - if (!HS->Content && !HS->Bucket && !HS->Chain) - return "one of \"Content\", \"Bucket\" or \"Chain\" must be specified"; + if (!HS->Content && !HS->Bucket && !HS->Chain && !HS->Size) + return "one of \"Content\", \"Size\", \"Bucket\" or \"Chain\" must be " + "specified"; + + if (HS->Content || HS->Size) { + if (HS->Size && HS->Content && + (uint64_t)*HS->Size < HS->Content->binary_size()) + return "\"Size\" must be greater than or equal to the content " + "size"; - if (HS->Content) { if (HS->Bucket) - return "\"Content\" and \"Bucket\" cannot be used together"; + return "\"Bucket\" cannot be used with \"Content\" or \"Size\""; if (HS->Chain) - return "\"Content\" and \"Chain\" cannot be used together"; + return "\"Chain\" cannot be used with \"Content\" or \"Size\""; return {}; } @@ -1226,6 +1245,31 @@ StringRef MappingTraits>::validate( return {}; } + if (const auto *Sec = dyn_cast(Section.get())) { + if (!Sec->Symbols && !Sec->Content && !Sec->Size) + return "one of \"Content\", \"Size\" or \"Symbols\" must be specified"; + + if (Sec->Content || Sec->Size) { + if (Sec->Size && Sec->Content && + (uint64_t)*Sec->Size < Sec->Content->binary_size()) + return "\"Size\" must be greater than or equal to the content " + "size"; + + if (Sec->Symbols) + return "\"Symbols\" cannot be used with \"Content\" or \"Size\""; + return {}; + } + + if (!Sec->Symbols) + return {}; + + for (const ELFYAML::AddrsigSymbol &AS : *Sec->Symbols) + if (AS.Index && AS.Name) + return "\"Index\" and \"Name\" cannot be used together when defining a " + "symbol"; + return {}; + } + return {}; } @@ -1333,6 +1377,12 @@ void MappingTraits::mapping(IO &IO, ELFYAML::Object &Object) { IO.setContext(nullptr); } +void MappingTraits::mapping(IO &IO, 
ELFYAML::AddrsigSymbol &Sym) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("Name", Sym.Name); + IO.mapOptional("Index", Sym.Index); +} + LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_AFL_REG) LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_ABI_FP) LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_EXT) diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index e94848df24e20f..003e8d4d429690 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -15,7 +15,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/Compression.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" @@ -156,6 +155,7 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const { FS.second.print(OS, Indent + 4); } } + OS.indent(Indent); OS << "}\n"; } else { OS << "No inlined callsites in this function\n"; @@ -198,66 +198,34 @@ FunctionSamples::findFunctionSamples(const DILocation *DIL) const { LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); } #endif -std::error_code ProfileSymbolList::read(uint64_t CompressSize, - uint64_t UncompressSize, - const uint8_t *Data) { +std::error_code ProfileSymbolList::read(const uint8_t *Data, + uint64_t ListSize) { const char *ListStart = reinterpret_cast(Data); - // CompressSize being non-zero means the profile is compressed and - // needs to be uncompressed first. - if (CompressSize) { - if (!llvm::zlib::isAvailable()) - return sampleprof_error::zlib_unavailable; - - StringRef CompressedStrings(reinterpret_cast(Data), - CompressSize); - char *Buffer = Allocator.Allocate(UncompressSize); - size_t UCSize = UncompressSize; - llvm::Error E = zlib::uncompress(CompressedStrings, Buffer, UCSize); - if (E) - return sampleprof_error::uncompress_failed; - ListStart = Buffer; - } - uint64_t Size = 0; - while (Size < UncompressSize) { + while (Size < ListSize) { StringRef Str(ListStart + Size); add(Str); Size += Str.size() + 1; } + if (Size != ListSize) + return sampleprof_error::malformed; return sampleprof_error::success; } std::error_code ProfileSymbolList::write(raw_ostream &OS) { - // Sort the symbols before doing compression. It will make the - // compression much more effective. + // Sort the symbols before output. If doing compression. + // It will make the compression much more effective. std::vector SortedList; SortedList.insert(SortedList.begin(), Syms.begin(), Syms.end()); llvm::sort(SortedList); - std::string UncompressedStrings; + std::string OutputString; for (auto &Sym : SortedList) { - UncompressedStrings.append(Sym.str()); - UncompressedStrings.append(1, '\0'); + OutputString.append(Sym.str()); + OutputString.append(1, '\0'); } - if (ToCompress) { - if (!llvm::zlib::isAvailable()) - return sampleprof_error::zlib_unavailable; - SmallString<128> CompressedStrings; - llvm::Error E = zlib::compress(UncompressedStrings, CompressedStrings, - zlib::BestSizeCompression); - if (E) - return sampleprof_error::compress_failed; - encodeULEB128(UncompressedStrings.size(), OS); - encodeULEB128(CompressedStrings.size(), OS); - OS << CompressedStrings.str(); - } else { - encodeULEB128(UncompressedStrings.size(), OS); - // If profile symbol list is not compressed, we will still save - // a compressed size value, but the value of the size is 0. 
- encodeULEB128(0, OS); - OS << UncompressedStrings; - } + OS << OutputString; return sampleprof_error::success; } diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 07272ebac0a9e5..6d00404b0bb22b 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/LineIterator.h" @@ -471,6 +472,7 @@ std::error_code SampleProfileReaderExtBinary::readOneSection(const uint8_t *Start, uint64_t Size, SecType Type) { Data = Start; + End = Start + Size; switch (Type) { case SecProfSummary: if (std::error_code EC = readSummary()) @@ -487,7 +489,7 @@ SampleProfileReaderExtBinary::readOneSection(const uint8_t *Start, } break; case SecProfileSymbolList: - if (std::error_code EC = readProfileSymbolList()) + if (std::error_code EC = readProfileSymbolList(Size)) return EC; break; default: @@ -496,27 +498,44 @@ SampleProfileReaderExtBinary::readOneSection(const uint8_t *Start, return sampleprof_error::success; } -std::error_code SampleProfileReaderExtBinary::readProfileSymbolList() { - auto UncompressSize = readNumber(); - if (std::error_code EC = UncompressSize.getError()) - return EC; +std::error_code +SampleProfileReaderExtBinary::readProfileSymbolList(uint64_t Size) { + if (!ProfSymList) + ProfSymList = std::make_unique(); - auto CompressSize = readNumber(); - if (std::error_code EC = CompressSize.getError()) + if (std::error_code EC = ProfSymList->read(Data, Size)) return EC; - if (!ProfSymList) - ProfSymList = std::make_unique(); + Data = Data + Size; + return sampleprof_error::success; +} - if (std::error_code EC = - ProfSymList->read(*CompressSize, *UncompressSize, Data)) +std::error_code SampleProfileReaderExtBinaryBase::decompressSection( + const uint8_t *SecStart, const uint64_t SecSize, + const uint8_t *&DecompressBuf, uint64_t &DecompressBufSize) { + Data = SecStart; + End = SecStart + SecSize; + auto DecompressSize = readNumber(); + if (std::error_code EC = DecompressSize.getError()) return EC; + DecompressBufSize = *DecompressSize; - // CompressSize is zero only when ProfileSymbolList is not compressed. - if (*CompressSize == 0) - Data = Data + *UncompressSize; - else - Data = Data + *CompressSize; + auto CompressSize = readNumber(); + if (std::error_code EC = CompressSize.getError()) + return EC; + + if (!llvm::zlib::isAvailable()) + return sampleprof_error::zlib_unavailable; + + StringRef CompressedStrings(reinterpret_cast(Data), + *CompressSize); + char *Buffer = Allocator.Allocate(DecompressBufSize); + size_t UCSize = DecompressBufSize; + llvm::Error E = + zlib::uncompress(CompressedStrings, Buffer, UCSize); + if (E) + return sampleprof_error::uncompress_failed; + DecompressBuf = reinterpret_cast(Buffer); return sampleprof_error::success; } @@ -528,11 +547,35 @@ std::error_code SampleProfileReaderExtBinaryBase::read() { // Skip empty section. if (!Entry.Size) continue; + const uint8_t *SecStart = BufStart + Entry.Offset; - if (std::error_code EC = readOneSection(SecStart, Entry.Size, Entry.Type)) + uint64_t SecSize = Entry.Size; + + // If the section is compressed, decompress it into a buffer + // DecompressBuf before reading the actual data. 
The pointee of + // 'Data' will be changed to buffer hold by DecompressBuf + // temporarily when reading the actual data. + bool isCompressed = hasSecFlag(Entry, SecFlagCompress); + if (isCompressed) { + const uint8_t *DecompressBuf; + uint64_t DecompressBufSize; + if (std::error_code EC = decompressSection( + SecStart, SecSize, DecompressBuf, DecompressBufSize)) + return EC; + SecStart = DecompressBuf; + SecSize = DecompressBufSize; + } + + if (std::error_code EC = readOneSection(SecStart, SecSize, Entry.Type)) return EC; - if (Data != SecStart + Entry.Size) + if (Data != SecStart + SecSize) return sampleprof_error::malformed; + + // Change the pointee of 'Data' from DecompressBuf to original Buffer. + if (isCompressed) { + Data = BufStart + Entry.Offset; + End = BufStart + Buffer->getBufferSize(); + } } return sampleprof_error::success; @@ -621,10 +664,10 @@ std::error_code SampleProfileReaderExtBinaryBase::readSecHdrTableEntry() { return EC; Entry.Type = static_cast(*Type); - auto Flag = readUnencodedNumber(); - if (std::error_code EC = Flag.getError()) + auto Flags = readUnencodedNumber(); + if (std::error_code EC = Flags.getError()) return EC; - Entry.Flag = *Flag; + Entry.Flags = *Flags; auto Offset = readUnencodedNumber(); if (std::error_code EC = Offset.getError()) diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp index 068ce5bf959cda..03446367665bdc 100644 --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/Compression.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorOr.h" @@ -72,21 +73,58 @@ SampleProfileWriter::write(const StringMap &ProfileMap) { return sampleprof_error::success; } +SecHdrTableEntry & +SampleProfileWriterExtBinaryBase::getEntryInLayout(SecType Type) { + auto SecIt = std::find_if( + SectionLayout.begin(), SectionLayout.end(), + [=](const auto &Entry) -> bool { return Entry.Type == Type; }); + return *SecIt; +} + /// Return the current position and prepare to use it as the start /// position of a section. -uint64_t SampleProfileWriterExtBinaryBase::markSectionStart() { - return OutputStream->tell(); +uint64_t SampleProfileWriterExtBinaryBase::markSectionStart(SecType Type) { + uint64_t SectionStart = OutputStream->tell(); + auto &Entry = getEntryInLayout(Type); + // Use LocalBuf as a temporary output for writting data. + if (hasSecFlag(Entry, SecFlagCompress)) + LocalBufStream.swap(OutputStream); + return SectionStart; +} + +std::error_code SampleProfileWriterExtBinaryBase::compressAndOutput() { + if (!llvm::zlib::isAvailable()) + return sampleprof_error::zlib_unavailable; + std::string &UncompressedStrings = + static_cast(LocalBufStream.get())->str(); + if (UncompressedStrings.size() == 0) + return sampleprof_error::success; + auto &OS = *OutputStream; + SmallString<128> CompressedStrings; + llvm::Error E = zlib::compress(UncompressedStrings, CompressedStrings, + zlib::BestSizeCompression); + if (E) + return sampleprof_error::compress_failed; + encodeULEB128(UncompressedStrings.size(), OS); + encodeULEB128(CompressedStrings.size(), OS); + OS << CompressedStrings.str(); + UncompressedStrings.clear(); + return sampleprof_error::success; } -/// Add a new section into section header table. Return the position -/// of SectionEnd. 
-uint64_t -SampleProfileWriterExtBinaryBase::addNewSection(SecType Sec, +/// Add a new section into section header table. +std::error_code +SampleProfileWriterExtBinaryBase::addNewSection(SecType Type, uint64_t SectionStart) { - uint64_t SectionEnd = OutputStream->tell(); - SecHdrTable.push_back( - {Sec, 0, SectionStart - FileStart, SectionEnd - SectionStart}); - return SectionEnd; + auto Entry = getEntryInLayout(Type); + if (hasSecFlag(Entry, SecFlagCompress)) { + LocalBufStream.swap(OutputStream); + if (std::error_code EC = compressAndOutput()) + return EC; + } + SecHdrTable.push_back({Type, Entry.Flags, SectionStart - FileStart, + OutputStream->tell() - SectionStart}); + return sampleprof_error::success; } std::error_code SampleProfileWriterExtBinaryBase::write( @@ -94,6 +132,8 @@ std::error_code SampleProfileWriterExtBinaryBase::write( if (std::error_code EC = writeHeader(ProfileMap)) return EC; + std::string LocalBuf; + LocalBufStream = std::make_unique(LocalBuf); if (std::error_code EC = writeSections(ProfileMap)) return EC; @@ -105,28 +145,38 @@ std::error_code SampleProfileWriterExtBinaryBase::write( std::error_code SampleProfileWriterExtBinary::writeSections( const StringMap &ProfileMap) { - uint64_t SectionStart = markSectionStart(); + uint64_t SectionStart = markSectionStart(SecProfSummary); computeSummary(ProfileMap); if (auto EC = writeSummary()) return EC; - SectionStart = addNewSection(SecProfSummary, SectionStart); + if (std::error_code EC = addNewSection(SecProfSummary, SectionStart)) + return EC; // Generate the name table for all the functions referenced in the profile. + SectionStart = markSectionStart(SecNameTable); for (const auto &I : ProfileMap) { addName(I.first()); addNames(I.second); } writeNameTable(); - SectionStart = addNewSection(SecNameTable, SectionStart); + if (std::error_code EC = addNewSection(SecNameTable, SectionStart)) + return EC; + SectionStart = markSectionStart(SecLBRProfile); if (std::error_code EC = writeFuncProfiles(ProfileMap)) return EC; - SectionStart = addNewSection(SecLBRProfile, SectionStart); + if (std::error_code EC = addNewSection(SecLBRProfile, SectionStart)) + return EC; + + if (ProfSymList && ProfSymList->toCompress()) + setToCompressSection(SecProfileSymbolList); + SectionStart = markSectionStart(SecProfileSymbolList); if (ProfSymList && ProfSymList->size() > 0) if (std::error_code EC = ProfSymList->write(*OutputStream)) return EC; - addNewSection(SecProfileSymbolList, SectionStart); + if (std::error_code EC = addNewSection(SecProfileSymbolList, SectionStart)) + return EC; return sampleprof_error::success; } @@ -308,6 +358,23 @@ std::error_code SampleProfileWriterBinary::writeHeader( return sampleprof_error::success; } +void SampleProfileWriterExtBinaryBase::setToCompressAllSections() { + for (auto &Entry : SectionLayout) + addSecFlags(Entry, SecFlagCompress); +} + +void SampleProfileWriterExtBinaryBase::setToCompressSection(SecType Type) { + addSectionFlags(Type, SecFlagCompress); +} + +void SampleProfileWriterExtBinaryBase::addSectionFlags(SecType Type, + SecFlags Flags) { + for (auto &Entry : SectionLayout) { + if (Entry.Type == Type) + addSecFlags(Entry, Flags); + } +} + void SampleProfileWriterExtBinaryBase::allocSecHdrTable() { support::endian::Writer Writer(*OutputStream, support::little); @@ -342,9 +409,9 @@ std::error_code SampleProfileWriterExtBinaryBase::writeSecHdrTable() { // to adjust the order in SecHdrTable to be consistent with // SectionLayout when we write SecHdrTable to the memory. 
for (uint32_t i = 0; i < SectionLayout.size(); i++) { - uint32_t idx = IndexMap[static_cast(SectionLayout[i])]; + uint32_t idx = IndexMap[static_cast(SectionLayout[i].Type)]; Writer.write(static_cast(SecHdrTable[idx].Type)); - Writer.write(static_cast(SecHdrTable[idx].Flag)); + Writer.write(static_cast(SecHdrTable[idx].Flags)); Writer.write(static_cast(SecHdrTable[idx].Offset)); Writer.write(static_cast(SecHdrTable[idx].Size)); } @@ -362,7 +429,6 @@ std::error_code SampleProfileWriterExtBinaryBase::writeHeader( FileStart = OS.tell(); writeMagicIdent(Format); - initSectionLayout(); allocSecHdrTable(); return sampleprof_error::success; } diff --git a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp index 66eb06bbc4f5db..3a42fe0678eb1a 100644 --- a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp +++ b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp @@ -103,7 +103,7 @@ template <> struct MappingTraits { /// newlines in strings. struct StringBlockVal { StringRef Value; - StringBlockVal(const std::string &Value) : Value(Value) {} + StringBlockVal(StringRef R) : Value(R) {} }; template <> struct BlockScalarTraits { diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 25510fa58ff543..620f7ffd4c9fac 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -692,7 +692,7 @@ static inline bool ProvideOption(Option *Handler, StringRef ArgName, return false; } -static bool ProvidePositionalOption(Option *Handler, StringRef Arg, int i) { +bool llvm::cl::ProvidePositionalOption(Option *Handler, StringRef Arg, int i) { int Dummy = i; return ProvideOption(Handler, Handler->ArgStr, Arg, 0, nullptr, Dummy); } diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 5509ec00886271..2a473a1994c2b0 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1512,6 +1512,17 @@ bool sys::getHostCPUFeatures(StringMap &Features) { return true; } +#elif defined(_WIN32) && (defined(__aarch64__) || defined(_M_ARM64)) +bool sys::getHostCPUFeatures(StringMap &Features) { + if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) + Features["neon"] = true; + if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) + Features["crc"] = true; + if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) + Features["crypto"] = true; + + return true; +} #else bool sys::getHostCPUFeatures(StringMap &Features) { return false; } #endif diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index bf97e37f324aa5..48ded6c45a4604 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -49,6 +49,9 @@ static cl::list MacroNames("D", cl::desc("Name of the macro to be defined"), cl::value_desc("macro name"), cl::Prefix); +static cl::opt +WriteIfChanged("write-if-changed", cl::desc("Only write output if it changed")); + static int reportError(const char *ProgName, Twine Msg) { errs() << ProgName << ": " << Msg; errs().flush(); @@ -99,23 +102,41 @@ int llvm::TableGenMain(char *argv0, TableGenMainFn *MainFn) { if (Parser.ParseFile()) return 1; - std::error_code EC; - ToolOutputFile Out(OutputFilename, EC, sys::fs::OF_None); - if (EC) - return reportError(argv0, "error opening " + OutputFilename + ":" + - EC.message() + "\n"); + // Write output to memory. + std::string OutString; + raw_string_ostream Out(OutString); + if (MainFn(Out, Records)) + return 1; + + // Always write the depfile, even if the main output hasn't changed. 
+ // If it's missing, Ninja considers the output dirty. If this was below + // the early exit below and someone deleted the .inc.d file but not the .inc + // file, tablegen would never write the depfile. if (!DependFilename.empty()) { if (int Ret = createDependencyFile(Parser, argv0)) return Ret; } - if (MainFn(Out.os(), Records)) - return 1; + if (WriteIfChanged) { + // Only updates the real output file if there are any differences. + // This prevents recompilation of all the files depending on it if there + // aren't any. + if (auto ExistingOrErr = MemoryBuffer::getFile(OutputFilename)) + if (std::move(ExistingOrErr.get())->getBuffer() == Out.str()) + return 0; + } + + std::error_code EC; + ToolOutputFile OutFile(OutputFilename, EC, sys::fs::OF_None); + if (EC) + return reportError(argv0, "error opening " + OutputFilename + ":" + + EC.message() + "\n"); + OutFile.os() << Out.str(); if (ErrorsPrinted > 0) return reportError(argv0, Twine(ErrorsPrinted) + " errors.\n"); // Declare success. - Out.keep(); + OutFile.keep(); return 0; } diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 6689ee48200e26..51bf35d4a161fd 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -406,6 +406,7 @@ include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td new file mode 100644 index 00000000000000..c4658f73b8ddcd --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -0,0 +1,15 @@ +//=- AArch64.td - Define AArch64 Combine Rules ---------------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +def AArch64PreLegalizerCombinerHelper: GICombinerHelper< + "AArch64GenPreLegalizerCombinerHelper">; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 8357b763179d2b..c42c16bc1aad43 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -55,6 +55,10 @@ // | callee-saved fp/simd/SVE regs | // | | // |-----------------------------------| +// | | +// | SVE stack objects | +// | | +// |-----------------------------------| // |.empty.space.to.make.part.below....| // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at // |.the.standard.16-byte.alignment....| compile time; if present) @@ -202,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { return DefaultSafeSPDisplacement; } +/// Returns the size of the entire SVE stackframe (calleesaves + spills). 
+static StackOffset getSVEStackSize(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo(); + return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8}; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -214,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo(); unsigned NumBytes = AFI->getLocalStackSize(); - return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 || + getSVEStackSize(MF)); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -456,6 +467,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (canUseRedZone(MF)) return false; + // When there is an SVE area on the stack, always allocate the + // callee-saves and spills/locals separately. + if (getSVEStackSize(MF)) + return false; + return true; } @@ -870,6 +886,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Ideally it should match SP value after prologue. AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame @@ -880,6 +898,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); + assert(!SVEStackSize && + "unexpected function without stack frame but with SVE objects"); // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); if (!NumBytes) @@ -926,6 +946,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); @@ -1083,6 +1104,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = 0; } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII, + MachineInstr::FrameSetup); + // Allocate space for the rest of the frame. if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1431,8 +1455,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); @@ -1446,6 +1473,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); + // Deallocate the SVE area. 
+ if (SVEStackSize) + if (!AFI->isStackRealigned()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize, + TII, MachineInstr::FrameDestroy); + if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the @@ -1595,6 +1628,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + if (SVEStackSize) + llvm_unreachable("Accessing frame indices in presence of SVE " + "not yet supported"); + // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't // reliable as a base). Make sure useFPForScavengingIndex() does the @@ -2175,8 +2213,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); + bool HasSVEStackObjects = [&MFI]() { + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) + if (MFI.getStackID(I) == TargetStackID::SVEVector && + MFI.getObjectOffset(I) < 0) + return true; + // Note: We don't take allocatable stack objects into + // account yet, because allocation for those is not yet + // implemented. + return false; + }(); + // If any callee-saved registers are used, the frame cannot be eliminated. - bool CanEliminateFrame = SavedRegs.count() == 0; + bool CanEliminateFrame = (SavedRegs.count() == 0) && !HasSVEStackObjects; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. @@ -2239,12 +2288,34 @@ bool AArch64FrameLowering::enableStackSlotScavenging( void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && + "Upwards growing stack unsupported"); + + // Process all fixed stack SVE objects. + int64_t Offset = 0; + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) { + unsigned StackID = MFI.getStackID(I); + if (StackID == TargetStackID::SVEVector) { + int64_t FixedOffset = -MFI.getObjectOffset(I); + if (FixedOffset > Offset) + Offset = FixedOffset; + } + } + + unsigned MaxAlign = getStackAlignment(); + uint64_t SVEStackSize = alignTo(Offset, MaxAlign); + + AArch64FunctionInfo *AFI = MF.getInfo(); + AFI->setStackSizeSVE(SVEStackSize); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. 
if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 7ed20d24607fb6..99d868a95a7073 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -87,6 +87,17 @@ class AArch64FrameLowering : public TargetFrameLowering { int FI) const override; int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; + bool isSupportedStackID(TargetStackID::Value ID) const override { + switch (ID) { + default: + return false; + case TargetStackID::Default: + case TargetStackID::SVEVector: + case TargetStackID::NoAlloc: + return true; + } + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 097a8ba0ae19a5..1cc3177b26a7f4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3046,6 +3046,16 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MaxEncoding = 0xfff; ShiftSize = 12; break; + case AArch64::ADDVL_XXI: + case AArch64::ADDPL_XXI: + MaxEncoding = 31; + ShiftSize = 0; + if (Offset < 0) { + MaxEncoding = 32; + Sign = -1; + Offset = -Offset; + } + break; default: llvm_unreachable("Unsupported opcode"); } @@ -3117,8 +3127,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV, bool NeedsWinCFI, bool *HasWinCFI) { - int64_t Bytes; - Offset.getForFrameOffset(Bytes); + int64_t Bytes, NumPredicateVectors, NumDataVectors; + Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); // First emit non-scalable frame offsets, or a simple 'mov'. if (Bytes || (!Offset && SrcReg != DestReg)) { @@ -3133,6 +3143,23 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, NeedsWinCFI, HasWinCFI); SrcReg = DestReg; } + + assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && + "SetNZCV not supported with SVE vectors"); + assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && + "WinCFI not supported with SVE vectors"); + + if (NumDataVectors) { + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, + AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); + SrcReg = DestReg; + } + + if (NumPredicateVectors) { + assert(DestReg != AArch64::SP && "Unaligned access to SP"); + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, + AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); + } } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 0efeeb272ec1f9..a7d0a742573d7f 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -95,6 +95,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// returned struct in a register. This field holds the virtual register into /// which the sret argument is passed. unsigned SRetReturnReg = 0; + /// SVE stack size (for predicates and data vectors) are maintained here + /// rather than in FrameInfo, as the placement and Stack IDs are target + /// specific. 
+ uint64_t StackSizeSVE = 0; + + /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a /// redzone, and no value otherwise. @@ -131,6 +138,15 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { ArgumentStackToRestore = bytes; } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + void setStackSizeSVE(uint64_t S) { + HasCalculatedStackSizeSVE = true; + StackSizeSVE = S; + } + + uint64_t getStackSizeSVE() const { return StackSizeSVE; } + bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 6df3f944f8c34b..bea75b83517bbb 100644 --- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -27,12 +27,22 @@ using namespace llvm; using namespace MIPatternMatch; +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + namespace { +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + class AArch64PreLegalizerCombinerInfo : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; public: + AArch64GenPreLegalizerCombinerHelper Generated; + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, @@ -81,9 +91,16 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, } } + if (Generated.tryCombineAll(Observer, MI, B)) + return true; + return false; } +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + // Pass boilerplate // ================ diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index cdf313db1b9c1a..1657a76a685cce 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -97,11 +97,11 @@ let Predicates = [HasSVE] in { defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>; defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>; - defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls">; - defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz">; - defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt">; - defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">; - defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not">; + defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", null_frag>; + defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", null_frag>; + defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>; + defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", null_frag>; + defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", null_frag>; defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">; defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">; @@ -138,12 +138,12 @@ let Predicates = [HasSVE] in { defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">; defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, 
"fdiv">; - defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">; - defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">; - defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">; - defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">; - defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">; - defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">; + defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; + defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>; + defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>; + defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>; + defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>; + defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>; defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">; diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h index 5f5cdfa2fad1cb..13f12a6c9c30a4 100644 --- a/llvm/lib/Target/AArch64/AArch64StackOffset.h +++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h @@ -35,32 +35,38 @@ namespace llvm { /// vector and a 64bit GPR. class StackOffset { int64_t Bytes; + int64_t ScalableBytes; explicit operator int() const; public: using Part = std::pair; - StackOffset() : Bytes(0) {} + StackOffset() : Bytes(0), ScalableBytes(0) {} StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() { - assert(!MVT(T).isScalableVector() && "Scalable types not supported"); + assert(MVT(T).getSizeInBits() % 8 == 0 && + "Offset type is not a multiple of bytes"); *this += Part(Offset, T); } - StackOffset(const StackOffset &Other) : Bytes(Other.Bytes) {} + StackOffset(const StackOffset &Other) + : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {} StackOffset &operator=(const StackOffset &) = default; StackOffset &operator+=(const StackOffset::Part &Other) { - assert(Other.second.getSizeInBits() % 8 == 0 && - "Offset type is not a multiple of bytes"); - Bytes += Other.first * (Other.second.getSizeInBits() / 8); + int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8); + if (Other.second.isScalableVector()) + ScalableBytes += OffsetInBytes; + else + Bytes += OffsetInBytes; return *this; } StackOffset &operator+=(const StackOffset &Other) { Bytes += Other.Bytes; + ScalableBytes += Other.ScalableBytes; return *this; } @@ -72,6 +78,7 @@ class StackOffset { StackOffset &operator-=(const StackOffset &Other) { Bytes -= Other.Bytes; + ScalableBytes -= Other.ScalableBytes; return *this; } @@ -88,16 +95,42 @@ class StackOffset { return Res; } + /// Returns the scalable part of the offset in bytes. + int64_t getScalableBytes() const { return ScalableBytes; } + /// Returns the non-scalable part of the offset in bytes. int64_t getBytes() const { return Bytes; } /// Returns the offset in parts to which this frame offset can be /// decomposed for the purpose of describing a frame offset. /// For non-scalable offsets this is simply its byte size. - void getForFrameOffset(int64_t &ByteSized) const { ByteSized = Bytes; } + void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors, + int64_t &NumDataVectors) const { + assert(isValid() && "Invalid frame offset"); + + NumBytes = Bytes; + NumDataVectors = 0; + NumPredicateVectors = ScalableBytes / 2; + // This method is used to get the offsets to adjust the frame offset. + // If the function requires ADDPL to be used and needs more than two ADDPL + // instructions, part of the offset is folded into NumDataVectors so that it + // uses ADDVL for part of it, reducing the number of ADDPL instructions. 
+ if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || + NumPredicateVectors > 62) { + NumDataVectors = NumPredicateVectors / 8; + NumPredicateVectors -= NumDataVectors * 8; + } + } /// Returns whether the offset is known zero. - explicit operator bool() const { return Bytes; } + explicit operator bool() const { return Bytes || ScalableBytes; } + + bool isValid() const { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. + return ScalableBytes % 2 == 0; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 7af0de7f647e02..4fb409f020d91a 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -5276,7 +5276,7 @@ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) { auto parseOp = [&]() -> bool { SMLoc L = getLoc(); - const MCExpr *Expr; + const MCExpr *Expr = nullptr; if (check(getParser().parseExpression(Expr), L, "expected expression")) return true; const MCConstantExpr *Value = dyn_cast_or_null(Expr); diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 0da057ea99736b..103925d45d5104 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -8,6 +8,8 @@ tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel) +tablegen(LLVM AArch64GenGICombiner.inc -gen-global-isel-combiner + -combiners="AArch64PreLegalizerCombinerHelper") tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index d0a544273b8bbc..1a16468484adab 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -172,7 +172,8 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, int ImmS = MI->getOperand(4).getImm(); if ((Op2.getReg() == AArch64::WZR || Op2.getReg() == AArch64::XZR) && - (ImmR == 0 || ImmS < ImmR)) { + (ImmR == 0 || ImmS < ImmR) && + STI.getFeatureBits()[AArch64::HasV8_2aOps]) { // BFC takes precedence over its entire range, sligtly differently to BFI. int BitWidth = Opcode == AArch64::BFMXri ? 
64 : 32; int LSB = (BitWidth - ImmR) % BitWidth; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index f57e111b7e1336..e2bd47ee6ae31e 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -1219,10 +1219,12 @@ multiclass sve_fp_ftmad { //===----------------------------------------------------------------------===// class sve_fp_3op_u_zd sz, bits<3> opc, string asm, - ZPRRegOp zprty> + ZPRRegOp zprty, + ValueType vt, ValueType vt2, SDPatternOperator op> : I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", - "", []>, Sched<[]> { + "", + [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> { bits<5> Zd; bits<5> Zm; bits<5> Zn; @@ -1236,10 +1238,10 @@ class sve_fp_3op_u_zd sz, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_fp_3op_u_zd opc, string asm> { - def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>; - def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; - def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>; +multiclass sve_fp_3op_u_zd opc, string asm, SDPatternOperator op> { + def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>; + def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>; + def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>; } //===----------------------------------------------------------------------===// @@ -2876,11 +2878,21 @@ multiclass sve_int_un_pred_arit_0_d opc, string asm> { def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; } -multiclass sve_int_un_pred_arit_1 opc, string asm> { +multiclass sve_int_un_pred_arit_1 opc, string asm, + SDPatternOperator op> { def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>; def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>; def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>; def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } multiclass sve_int_un_pred_arit_1_fp opc, string asm> { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 6ee11686f4859c..f2be1ca44d3468 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -116,6 +116,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; class GISelSop2Pat < SDPatternOperator node, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b2491ebc6f48fd..c74a361b2c7125 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -186,10 +186,11 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, SDValue &DLC) const; + SDValue &SLC, SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, 
SDValue &Offset, SDValue &SLC) const; @@ -202,7 +203,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, @@ -1313,7 +1314,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { // Subtarget prefers to use flat instruction if (Subtarget->useFlatForGlobal()) return false; @@ -1326,6 +1328,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1405,7 +1408,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, - SDValue &DLC) const { + SDValue &DLC, SDValue &SWZ) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. @@ -1413,7 +1416,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; ConstantSDNode *C = cast(Addr64); @@ -1435,9 +1438,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1562,13 +1565,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; if (!cast(Offen)->getSExtValue() && @@ -1590,16 +1594,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE, DLC; + SDValue GLC, SLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE, DLC; + 
SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } template diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5480eb5595a5da..afdeacc42910ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -434,8 +434,11 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); - assert(I.getOperand(2).getImm() % 32 == 0); - unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32); + unsigned Offset = I.getOperand(2).getImm(); + if (Offset % 32 != 0) + return false; + + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32); const DebugLoc &DL = I.getDebugLoc(); MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), I.getOperand(0).getReg()) @@ -554,37 +557,53 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); + + Register DstReg = I.getOperand(0).getReg(); Register Src0Reg = I.getOperand(1).getReg(); Register Src1Reg = I.getOperand(2).getReg(); LLT Src1Ty = MRI->getType(Src1Reg); - if (Src1Ty.getSizeInBits() != 32) - return false; + + unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); + unsigned InsSize = Src1Ty.getSizeInBits(); int64_t Offset = I.getOperand(3).getImm(); if (Offset % 32 != 0) return false; - unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32); - const DebugLoc &DL = I.getDebugLoc(); + unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); + if (SubReg == AMDGPU::NoSubRegister) + return false; - MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG)) - .addDef(I.getOperand(0).getReg()) - .addReg(Src0Reg) - .addReg(Src1Reg) - .addImm(SubReg); + const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); + const TargetRegisterClass *DstRC = + TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI); + if (!DstRC) + return false; - for (const MachineOperand &MO : Ins->operands()) { - if (!MO.isReg()) - continue; - if (Register::isPhysicalRegister(MO.getReg())) - continue; + const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); + const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); + const TargetRegisterClass *Src0RC = + TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI); + const TargetRegisterClass *Src1RC = + TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI); + + // Deal with weird cases where the class only partially supports the subreg + // index. 
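The selectG_INSERT rewrite above only handles inserts that land on a 32-bit channel boundary: the subregister index is looked up from the channel (Offset / 32) and the insert width in 32-bit registers (InsSize / 32), and selection simply fails otherwise. A small sketch of just that arithmetic, with the actual subregister-table lookup stubbed out as an assumption:

#include <cstdio>

// Returns false when the insert does not start on a 32-bit channel, matching
// the early bail-out in the patch; the real code then maps (Channel, NumRegs)
// to a subregister index via getSubRegFromChannel.
static bool computeSubRegChannel(unsigned OffsetBits, unsigned InsSizeBits,
                                 unsigned &Channel, unsigned &NumRegs) {
  if (OffsetBits % 32 != 0)
    return false;
  Channel = OffsetBits / 32;
  NumRegs = InsSizeBits / 32;
  return true;
}

int main() {
  unsigned Channel, NumRegs;
  if (computeSubRegChannel(64, 64, Channel, NumRegs))
    std::printf("channel=%u numregs=%u\n", Channel, NumRegs); // 2, 2
  return 0;
}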
+ Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); + if (!Src0RC) + return false; + + if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || + !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) + return false; + + const DebugLoc &DL = I.getDebugLoc(); + BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) + .addReg(Src0Reg) + .addReg(Src1Reg) + .addImm(SubReg); - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, *MRI); - if (!RC) - continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); - } I.eraseFromParent(); return true; } @@ -762,16 +781,20 @@ static bool isZero(Register Reg, MachineRegisterInfo &MRI) { return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; } -static unsigned extractGLC(unsigned CachePolicy) { - return CachePolicy & 1; +static unsigned extractGLC(unsigned AuxiliaryData) { + return AuxiliaryData & 1; +} + +static unsigned extractSLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 1) & 1; } -static unsigned extractSLC(unsigned CachePolicy) { - return (CachePolicy >> 1) & 1; +static unsigned extractDLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 2) & 1; } -static unsigned extractDLC(unsigned CachePolicy) { - return (CachePolicy >> 2) & 1; +static unsigned extractSWZ(unsigned AuxiliaryData) { + return (AuxiliaryData >> 3) & 1; } // Returns Base register, constant offset, and offset def point. @@ -970,7 +993,7 @@ bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, Register RSrc = MI.getOperand(2).getReg(); Register VOffset = MI.getOperand(3).getReg(); Register SOffset = MI.getOperand(4).getReg(); - unsigned CachePolicy = MI.getOperand(5).getImm(); + unsigned AuxiliaryData = MI.getOperand(5).getImm(); unsigned ImmOffset; unsigned TotalOffset; @@ -994,10 +1017,11 @@ bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, MIB.addUse(RSrc) .addUse(SOffset) .addImm(ImmOffset) - .addImm(extractGLC(CachePolicy)) - .addImm(extractSLC(CachePolicy)) + .addImm(extractGLC(AuxiliaryData)) + .addImm(extractSLC(AuxiliaryData)) .addImm(0) // tfe: FIXME: Remove from inst - .addImm(extractDLC(CachePolicy)) + .addImm(extractDLC(AuxiliaryData)) + .addImm(extractSWZ(AuxiliaryData)) .addMemOperand(MMO); MI.eraseFromParent(); @@ -1406,31 +1430,38 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - DebugLoc DL = I.getDebugLoc(); - const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass : - &AMDGPU::VGPR_32RegClass; - Register LoReg = MRI->createVirtualRegister(RC); - Register HiReg = MRI->createVirtualRegister(RC); - const APInt &Imm = APInt(Size, I.getOperand(1).getImm()); - - BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) - .addImm(Imm.trunc(32).getZExtValue()); + const DebugLoc &DL = I.getDebugLoc(); - BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) - .addImm(Imm.ashr(32).getZExtValue()); + APInt Imm(Size, I.getOperand(1).getImm()); - const MachineInstr *RS = - BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(LoReg) - .addImm(AMDGPU::sub0) - .addReg(HiReg) - .addImm(AMDGPU::sub1); + MachineInstr *ResInst; + if (IsSgpr && TII.isInlineConstant(Imm)) { + ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) + .addImm(I.getOperand(1).getImm()); + } else { + const TargetRegisterClass *RC = IsSgpr ? 
+ &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass; + Register LoReg = MRI->createVirtualRegister(RC); + Register HiReg = MRI->createVirtualRegister(RC); + + BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) + .addImm(Imm.trunc(32).getZExtValue()); + + BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) + .addImm(Imm.ashr(32).getZExtValue()); + + ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + } // We can't call constrainSelectedInstRegOperands here, because it doesn't // work for target independent opcodes I.eraseFromParent(); const TargetRegisterClass *DstRC = - TRI.getConstrainedRegClassForOperand(RS->getOperand(0), *MRI); + TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); if (!DstRC) return true; return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); @@ -1642,7 +1673,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); - if (!isPreISelGenericOpcode(I.getOpcode())) { + if (!I.isPreISelOpcode()) { if (I.isCopy()) return selectCOPY(I); return true; @@ -1665,6 +1696,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectG_UADDO_USUBO(I); case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: + case TargetOpcode::G_PTRTOINT: return selectCOPY(I); case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b8b54a2ef1a5e3..dfb8ed55d6b571 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -48,12 +48,19 @@ static LegalityPredicate isMultiple32(unsigned TypeIdx, }; } +static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { + return [=](const LegalityQuery &Query) { + return Query.Types[TypeIdx].getSizeInBits() == Size; + }; +} + static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; return Ty.isVector() && Ty.getNumElements() % 2 != 0 && - Ty.getElementType().getSizeInBits() < 32; + Ty.getElementType().getSizeInBits() < 32 && + Ty.getSizeInBits() % 32 != 0; }; } @@ -268,7 +275,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) .clampScalar(0, S32, S64) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0)) + .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) .widenScalarToNextPow2(0) .scalarize(0); @@ -279,11 +286,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); // TODO: Implement. getActionDefinitionsBuilder(G_BITCAST) - .legalForCartesianProduct({S32, V2S16}) - .legalForCartesianProduct({S64, V2S32, V4S16}) - .legalForCartesianProduct({V2S64, V4S32}) // Don't worry about the size constraint. 
- .legalIf(all(isPointer(0), isPointer(1))) + .legalIf(all(isRegisterType(0), isRegisterType(1))) // FIXME: Testing hack .legalForCartesianProduct({S16, LLT::vector(2, 8), }); @@ -833,6 +837,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_ATOMICRMW_FADD) .legalFor({{S32, LocalPtr}}); + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) + .lower(); + // TODO: Pointer types, any 32-bit or 64-bit vector getActionDefinitionsBuilder(G_SELECT) .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, @@ -906,6 +913,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Doesn't handle extract of illegal sizes. getActionDefinitionsBuilder(Op) + .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) + // FIXME: Multiples of 16 should not be legal. .legalIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; const LLT LitTy = Query.Types[LitTyIdx]; @@ -975,7 +984,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return false; }; - getActionDefinitionsBuilder(Op) + auto &Builder = getActionDefinitionsBuilder(Op) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) // Clamp the little scalar to s8-s256 and make it a power of 2. It's not // worth considering the multiples of 64 since 2*192 and 2*384 are not @@ -994,25 +1003,36 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, scalarize(1)) .clampScalar(BigTyIdx, S32, S1024) - .lowerFor({{S16, V2S16}}) - .widenScalarIf( + .lowerFor({{S16, V2S16}}); + + if (Op == G_MERGE_VALUES) { + Builder.widenScalarIf( + // TODO: Use 16-bit shifts if legal for 8-bit values? [=](const LegalityQuery &Query) { - const LLT &Ty = Query.Types[BigTyIdx]; - return !isPowerOf2_32(Ty.getSizeInBits()) && - Ty.getSizeInBits() % 16 != 0; + const LLT Ty = Query.Types[LitTyIdx]; + return Ty.getSizeInBits() < 32; }, - [=](const LegalityQuery &Query) { - // Pick the next power of 2, or a multiple of 64 over 128. - // Whichever is smaller. - const LLT &Ty = Query.Types[BigTyIdx]; - unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); - if (NewSizeInBits >= 256) { - unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); - if (RoundedTo < NewSizeInBits) - NewSizeInBits = RoundedTo; - } - return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); - }) + changeTo(LitTyIdx, S32)); + } + + Builder.widenScalarIf( + [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[BigTyIdx]; + return !isPowerOf2_32(Ty.getSizeInBits()) && + Ty.getSizeInBits() % 16 != 0; + }, + [=](const LegalityQuery &Query) { + // Pick the next power of 2, or a multiple of 64 over 128. + // Whichever is smaller. 
+ const LLT &Ty = Query.Types[BigTyIdx]; + unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); + if (NewSizeInBits >= 256) { + unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); + if (RoundedTo < NewSizeInBits) + NewSizeInBits = RoundedTo; + } + return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); + }) .legalIf([=](const LegalityQuery &Query) { const LLT &BigTy = Query.Types[BigTyIdx]; const LLT &LitTy = Query.Types[LitTyIdx]; @@ -1087,6 +1107,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture( const GCNSubtarget &ST = MF.getSubtarget(); const LLT S32 = LLT::scalar(32); + assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); + if (ST.hasApertureRegs()) { // FIXME: Use inline constants (src_{shared, private}_base) instead of // getreg. @@ -1233,7 +1255,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); - Register ApertureReg = getSegmentAperture(DestAS, MRI, B); + Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); if (!ApertureReg.isValid()) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 261d6287763f4d..5250bf455d7192 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -30,7 +30,6 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -45,20 +44,13 @@ using namespace llvm; namespace { class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding final - : public ModulePass, - public InstVisitor { + : public ModulePass { public: static char ID; explicit AMDGPUPrintfRuntimeBinding(); - void visitCallSite(CallSite CS) { - Function *F = CS.getCalledFunction(); - if (F && F->hasName() && F->getName() == "printf") - Printfs.push_back(CS.getInstruction()); - } - private: bool runOnModule(Module &M) override; void getConversionSpecifiers(SmallVectorImpl &OpConvSpecifiers, @@ -80,7 +72,7 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding final const DataLayout *TD; const DominatorTree *DT; - SmallVector Printfs; + SmallVector Printfs; }; } // namespace @@ -162,8 +154,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( // NB: This is important for this string size to be divizable by 4 const char NonLiteralStr[4] = "???"; - for (auto P : Printfs) { - auto CI = cast(P); + for (auto CI : Printfs) { unsigned NumOps = CI->getNumArgOperands(); SmallString<16> OpConvSpecifiers; @@ -564,10 +555,8 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu( } // erase the printf calls - for (auto P : Printfs) { - auto CI = cast(P); + for (auto CI : Printfs) CI->eraseFromParent(); - } Printfs.clear(); return true; @@ -578,7 +567,16 @@ bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) { if (TT.getArch() == Triple::r600) return false; - visit(M); + auto PrintfFunction = M.getFunction("printf"); + if (!PrintfFunction) + return false; + + for (auto &U : PrintfFunction->uses()) { + if (auto *CI = dyn_cast(U.getUser())) { + if (CI->isCallee(&U)) + Printfs.push_back(CI); + } + } if (Printfs.empty()) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index badcd77aaef1fe..9446814c8f8181 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -17,7 +17,6 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -308,16 +307,16 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should have no register for immediate - static const OpRegBankEntry<2> Table[2] = { + static const OpRegBankEntry<1> Table[2] = { // Perfectly legal. - { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 }, + { { AMDGPU::SGPRRegBankID }, 1 }, // Need readlane - { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 } + { { AMDGPU::VGPRRegBankID }, 3 } }; - const std::array RegSrcOpIdx = { { 1, 2 } }; - return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); + const std::array RegSrcOpIdx = { { 2 } }; + return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table)); } default: return RegisterBankInfo::getInstrAlternativeMappings(MI); @@ -353,7 +352,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( { { AMDGPU::SCCRegBankID }, 1 } }; - return addMappingFromTable<1>(MI, MRI, { 0 }, Table); + return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table); } LLVM_FALLTHROUGH; @@ -659,56 +658,51 @@ static LLT getHalfSizedType(LLT Ty) { /// unique values used. bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineIRBuilder &B, - MachineInstr &MI, - MachineRegisterInfo &MRI, - ArrayRef OpIndices) const { - MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineBasicBlock::iterator I(MI); - - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - // Use a set to avoid extra readfirstlanes in the case where multiple operands - // are the same register. - SmallSet SGPROperandRegs; - for (unsigned Op : OpIndices) { - assert(MI.getOperand(Op).isUse()); - Register Reg = MI.getOperand(Op).getReg(); - const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); - if (OpBank->getID() == AMDGPU::VGPRRegBankID) - SGPROperandRegs.insert(Reg); - } - - // No operands need to be replaced, so no need to loop. - if (SGPROperandRegs.empty()) - return false; - + iterator_range Range, + SmallSet &SGPROperandRegs, + MachineRegisterInfo &MRI) const { SmallVector ResultRegs; SmallVector InitResultRegs; SmallVector PhiRegs; - for (MachineOperand &Def : MI.defs()) { - LLT ResTy = MRI.getType(Def.getReg()); - const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); - ResultRegs.push_back(Def.getReg()); - Register InitReg = B.buildUndef(ResTy).getReg(0); - Register PhiReg = MRI.createGenericVirtualRegister(ResTy); - InitResultRegs.push_back(InitReg); - PhiRegs.push_back(PhiReg); - MRI.setRegBank(PhiReg, *DefBank); - MRI.setRegBank(InitReg, *DefBank); + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction *MF = &B.getMF(); + + const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); + const unsigned WaveAndOpc = Subtarget.isWave32() ? + AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + const unsigned MovTermOpc = Subtarget.isWave32() ? + AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + const unsigned XorTermOpc = Subtarget.isWave32() ? 
+ AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; + const unsigned AndSaveExecOpc = Subtarget.isWave32() ? + AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; + const unsigned ExecReg = Subtarget.isWave32() ? + AMDGPU::EXEC_LO : AMDGPU::EXEC; + + for (MachineInstr &MI : Range) { + for (MachineOperand &Def : MI.defs()) { + LLT ResTy = MRI.getType(Def.getReg()); + const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); + ResultRegs.push_back(Def.getReg()); + Register InitReg = B.buildUndef(ResTy).getReg(0); + Register PhiReg = MRI.createGenericVirtualRegister(ResTy); + InitResultRegs.push_back(InitReg); + PhiRegs.push_back(PhiReg); + MRI.setRegBank(PhiReg, *DefBank); + MRI.setRegBank(InitReg, *DefBank); + } } - Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + Register SaveExecReg = MRI.createVirtualRegister(WaveRC); + Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); // Don't bother using generic instructions/registers for the exec mask. B.buildInstr(TargetOpcode::IMPLICIT_DEF) .addDef(InitSaveExecReg); - Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + Register PhiExec = MRI.createVirtualRegister(WaveRC); + Register NewExec = MRI.createVirtualRegister(WaveRC); // To insert the loop we need to split the block. Move everything before this // point to a new block, and insert a new empty block before this instruction. @@ -726,7 +720,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // Move the rest of the block into a new block. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); MBB.addSuccessor(LoopBB); RestoreExecBB->addSuccessor(RemainderBB); @@ -749,164 +743,173 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); } - // Move the instruction into the loop. - LoopBB->splice(LoopBB->end(), &MBB, I); - I = std::prev(LoopBB->end()); + const DebugLoc &DL = B.getDL(); + + // Figure out the iterator range after splicing the instructions. + auto NewBegin = std::prev(LoopBB->end()); - B.setInstr(*I); + // Move the instruction into the loop. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); + + auto NewEnd = LoopBB->end(); + + MachineBasicBlock::iterator I = Range.begin(); + B.setInsertPt(*LoopBB, I); Register CondReg; - for (MachineOperand &Op : MI.uses()) { - if (!Op.isReg()) - continue; + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.uses()) { + if (!Op.isReg() || Op.isDef()) + continue; - assert(!Op.isDef()); - if (SGPROperandRegs.count(Op.getReg())) { - LLT OpTy = MRI.getType(Op.getReg()); - unsigned OpSize = OpTy.getSizeInBits(); - - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); - - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg) - .addReg(Op.getReg()); - - Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(Op.getReg()); - Op.setReg(CurrentLaneOpReg); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } else { - LLT S32 = LLT::scalar(32); - SmallVector ReadlanePieces; + if (SGPROperandRegs.count(Op.getReg())) { + LLT OpTy = MRI.getType(Op.getReg()); + unsigned OpSize = OpTy.getSizeInBits(); - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // Can only do a readlane of 32-bit pieces. + if (OpSize == 32) { + // Avoid extra copies in the simple case of one 32-bit register. + Register CurrentLaneOpReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + MRI.setType(CurrentLaneOpReg, OpTy); - bool Is64 = OpSize % 64 == 0; + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(Op.getReg()); - LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); - unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 - : AMDGPU::V_CMP_EQ_U32_e64; + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // Compare the just read M0 value to all possible Idx values. + B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(Op.getReg()); + Op.setReg(CurrentLaneOpReg); - // Insert the unmerge before the loop. + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); - B.setMBB(MBB); - auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); - B.setInstr(*I); + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } else { + LLT S32 = LLT::scalar(32); + SmallVector ReadlanePieces; - unsigned NumPieces = Unmerge->getNumOperands() - 1; - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { - Register UnmergePiece = Unmerge.getReg(PieceIdx); + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + bool Is64 = OpSize % 64 == 0; - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); + unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 + : AMDGPU::V_CMP_EQ_U32_e64; - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); + // Insert the unmerge before the loop. - CurrentLaneOpReg = - B.buildMerge(LLT::scalar(64), - {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); + B.setMBB(MBB); + auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); + B.setInstr(*I); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + unsigned NumPieces = Unmerge->getNumOperands() - 1; + for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { + Register UnmergePiece = Unmerge.getReg(PieceIdx); - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); + Register CurrentLaneOpReg; + if (Is64) { + Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); + Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + + MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); + MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); + MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegLo) + .addReg(UnmergePiece, 0, AMDGPU::sub0); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegHi) + .addReg(UnmergePiece, 0, AMDGPU::sub1); + + CurrentLaneOpReg = + B.buildMerge(LLT::scalar(64), + {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) + .getReg(0); + + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + + if (OpTy.getScalarSizeInBits() == 64) { + // If we need to produce a 64-bit element vector, so use the + // merged pieces + ReadlanePieces.push_back(CurrentLaneOpReg); + } else { + // 32-bit element type. + ReadlanePieces.push_back(CurrentLaneOpRegLo); + ReadlanePieces.push_back(CurrentLaneOpRegHi); + } } else { - // 32-bit element type. - ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); + CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(UnmergePiece); + ReadlanePieces.push_back(CurrentLaneOpReg); } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } + Register NewCondReg = MRI.createVirtualRegister(WaveRC); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; - Register NewCondReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; + B.buildInstr(CmpOp) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(UnmergePiece); - B.buildInstr(CmpOp) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); + if (!First) { + Register AndReg = MRI.createVirtualRegister(WaveRC); - if (!First) { - Register AndReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + // If there are multiple operands to consider, and the conditions. + B.buildInstr(WaveAndOpc) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; + // FIXME: Build merge seems to switch to CONCAT_VECTORS but not + // BUILD_VECTOR + if (OpTy.isVector()) { + auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } else { + auto Merge = B.buildMerge(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); } - } - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - } else { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); + MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); } - - MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); } } } @@ -914,16 +917,16 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*LoopBB, LoopBB->end()); // Update EXEC, save the original EXEC value to VCC. - B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64) + B.buildInstr(AndSaveExecOpc) .addDef(NewExec) .addReg(CondReg, RegState::Kill); MRI.setSimpleHint(NewExec, CondReg); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - B.buildInstr(AMDGPU::S_XOR_B64_term) - .addDef(AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) + B.buildInstr(XorTermOpc) + .addDef(ExecReg) + .addReg(ExecReg) .addReg(NewExec); // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use @@ -934,13 +937,13 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); // Save the EXEC mask before the loop. - BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg) - .addReg(AMDGPU::EXEC); + BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg) + .addReg(ExecReg); // Restore the EXEC mask after the loop. B.setMBB(*RestoreExecBB); - B.buildInstr(AMDGPU::S_MOV_B64_term) - .addDef(AMDGPU::EXEC) + B.buildInstr(MovTermOpc) + .addDef(ExecReg) .addReg(SaveExecReg); // Restore the insert point before the original instruction. @@ -949,6 +952,40 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( return true; } +// Return any unique registers used by \p MI at \p OpIndices that need to be +// handled in a waterfall loop. Returns these registers in \p +// SGPROperandRegs. Returns true if there are any operansd to handle and a +// waterfall loop is necessary. 
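The surrounding hunks restructure the waterfall loop this code builds around non-uniform (VGPR) operands: V_READFIRSTLANE picks the value held by the first active lane, V_CMP_EQ plus S_AND_SAVEEXEC restrict execution to the lanes holding that same value, and an S_XOR terminator clears those lanes from the exec mask until none remain. The following is only a host-side simulation of that control flow with lanes modelled as a plain vector; no GPU or MIR semantics are exercised:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> LaneValues = {3, 7, 3, 9, 7, 3}; // per-lane operand
  std::vector<bool> Exec(LaneValues.size(), true);       // active-lane mask

  bool AnyActive = true;
  while (AnyActive) {
    // V_READFIRSTLANE: take the operand from the first still-active lane.
    unsigned Current = 0;
    for (std::size_t L = 0; L < Exec.size(); ++L) {
      if (Exec[L]) {
        Current = LaneValues[L];
        break;
      }
    }

    // V_CMP_EQ + S_AND_SAVEEXEC: this iteration covers every lane holding the
    // same value; S_XOR then removes those lanes from the exec mask.
    std::printf("iteration with uniform value %u, lanes:", Current);
    for (std::size_t L = 0; L < Exec.size(); ++L) {
      if (Exec[L] && LaneValues[L] == Current) {
        std::printf(" %zu", L);
        Exec[L] = false;
      }
    }
    std::printf("\n");

    AnyActive = false;
    for (std::size_t L = 0; L < Exec.size(); ++L)
      AnyActive = AnyActive || Exec[L];
  }
  return 0;
}

Each iteration retires one distinct operand value, so the loop runs once per unique value rather than once per lane.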
+bool AMDGPURegisterBankInfo::collectWaterfallOperands( + SmallSet &SGPROperandRegs, MachineInstr &MI, + MachineRegisterInfo &MRI, ArrayRef OpIndices) const { + for (unsigned Op : OpIndices) { + assert(MI.getOperand(Op).isUse()); + Register Reg = MI.getOperand(Op).getReg(); + const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); + if (OpBank->getID() == AMDGPU::VGPRRegBankID) + SGPROperandRegs.insert(Reg); + } + + // No operands need to be replaced, so no need to loop. + return !SGPROperandRegs.empty(); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef OpIndices) const { + // Use a set to avoid extra readfirstlanes in the case where multiple operands + // are the same register. + SmallSet SGPROperandRegs; + + if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) + return false; + + MachineBasicBlock::iterator I = MI.getIterator(); + return executeInWaterfallLoop(B, make_range(I, std::next(I)), + SGPROperandRegs, MRI); +} + bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef OpIndices) const { @@ -1551,7 +1588,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( if (DstTy != LLT::vector(2, 16)) break; - assert(MI.getNumOperands() == 3 && empty(OpdMapper.getVRegs(0))); + assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); substituteSimpleCopyRegs(OpdMapper, 1); substituteSimpleCopyRegs(OpdMapper, 2); @@ -1604,10 +1641,140 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } - case AMDGPU::G_EXTRACT_VECTOR_ELT: - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { 2 }); + case AMDGPU::G_EXTRACT_VECTOR_ELT: { + SmallVector DstRegs(OpdMapper.getVRegs(0)); + + assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); + + if (DstRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register IdxReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + (void)DstTy; + + assert(DstTy.getSizeInBits() == 64); + + LLT SrcTy = MRI.getType(SrcReg); + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); + B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); + + const ValueMapping &DstMapping + = OpdMapper.getInstrMapping().getOperandMapping(0); + + // FIXME: Should be getting from mapping or not? 
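The G_EXTRACT_VECTOR_ELT handling above deals with a 64-bit element by bitcasting the source to a vector of twice as many 32-bit elements and extracting indices 2 * OrigIdx and 2 * OrigIdx + 1 into the two destination halves. A standalone sketch of that index arithmetic on plain arrays; the memcpy-based reinterpretation assumes a little-endian host and is only a stand-in for the G_BITCAST:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static void extract64ViaTwo32(const std::vector<uint64_t> &Vec, unsigned Idx,
                              uint32_t &Lo, uint32_t &Hi) {
  // "Bitcast" the <N x s64> source to a <2N x s32> view.
  std::vector<uint32_t> Vec32(Vec.size() * 2);
  std::memcpy(Vec32.data(), Vec.data(), Vec.size() * sizeof(uint64_t));
  unsigned IdxLo = Idx << 1;  // 2 * OrigIdx
  unsigned IdxHi = IdxLo + 1; // 2 * OrigIdx + 1
  Lo = Vec32[IdxLo];
  Hi = Vec32[IdxHi];
}

int main() {
  std::vector<uint64_t> V = {0x1111222233334444ULL, 0xaaaabbbbccccddddULL};
  uint32_t Lo, Hi;
  extract64ViaTwo32(V, 1, Lo, Hi);
  // On a little-endian host this prints the low then the high half of V[1].
  std::printf("lo=0x%08x hi=0x%08x\n", Lo, Hi);
  return 0;
}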
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + SmallSet OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { + MI.eraseFromParent(); + return; + } + + // Remove the original instruction to avoid potentially confusing the + // waterfall loop logic. + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); return; + } + case AMDGPU::G_INSERT_VECTOR_ELT: { + SmallVector InsRegs(OpdMapper.getVRegs(2)); + + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(1).empty()); + assert(OpdMapper.getVRegs(3).empty()); + + if (InsRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 3 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register InsReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT InsTy = MRI.getType(InsReg); + (void)InsTy; + + assert(InsTy.getSizeInBits() == 64); + + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + + auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); + auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); + B.buildBitcast(DstReg, InsHi); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI); + + MRI.setRegBank(InsReg, *InsSrcBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(InsLo.getReg(0), *DstBank); + MRI.setRegBank(InsHi.getReg(0), *DstBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + + SmallSet OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { + MI.eraseFromParent(); + return; + } + + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); + return; + } case AMDGPU::G_INTRINSIC: { switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_s_buffer_load: { @@ -1618,8 +1785,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_readlane: { substituteSimpleCopyRegs(OpdMapper, 2); - assert(empty(OpdMapper.getVRegs(0))); - assert(empty(OpdMapper.getVRegs(3))); + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(3).empty()); // Make sure the index is an SGPR. 
It doesn't make sense to run this in a // waterfall loop, so assume it's a uniform value. @@ -1627,9 +1794,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case Intrinsic::amdgcn_writelane: { - assert(empty(OpdMapper.getVRegs(0))); - assert(empty(OpdMapper.getVRegs(2))); - assert(empty(OpdMapper.getVRegs(3))); + assert(OpdMapper.getVRegs(0).empty()); + assert(OpdMapper.getVRegs(2).empty()); + assert(OpdMapper.getVRegs(3).empty()); substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val constrainOpWithReadfirstlane(MI, MRI, 2); // Source value @@ -1651,11 +1818,26 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { // This is only allowed to execute with 1 lane, so readfirstlane is safe. - assert(empty(OpdMapper.getVRegs(0))); + assert(OpdMapper.getVRegs(0).empty()); substituteSimpleCopyRegs(OpdMapper, 3); constrainOpWithReadfirstlane(MI, MRI, 2); // M0 return; } + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_br: { + // Only the first lane is executes, so readfirstlane is safe. + substituteSimpleCopyRegs(OpdMapper, 1); + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { + // Only the first lane is executes, so readfirstlane is safe. + constrainOpWithReadfirstlane(MI, MRI, 1); // M0 + return; + } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? @@ -2123,11 +2305,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FCANONICALIZE: case AMDGPU::G_INTRINSIC_TRUNC: case AMDGPU::G_INTRINSIC_ROUND: + case AMDGPU::G_AMDGPU_FFBH_U32: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { - if (MF.getSubtarget().hasScalarMulHiInsts() && - isSALUMapping(MI)) + if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); } @@ -2301,7 +2483,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { Op3Bank == AMDGPU::SGPRRegBankID && (Size == 32 || (Size == 64 && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && - MF.getSubtarget().hasScalarCompareEq64())); + Subtarget.hasScalarCompareEq64())); unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; @@ -2312,14 +2494,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_EXTRACT_VECTOR_ELT: { - unsigned OutputBankID = isSALUMapping(MI) ? - AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + // VGPR index can be used for waterfall when indexing a SGPR vector. 
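This remark pairs with the mapping computed just below, where the result bank becomes regBankUnion(SrcBankID, IdxBank): the extract result can stay in the SGPR bank only when both the source vector and the index are SGPRs, while any VGPR input (handled via the waterfall loop) forces a VGPR result. Modelling regBankUnion as that two-way join is an assumption of this sketch:

#include <cstdio>

enum Bank { SGPR, VGPR };

// Assumed behaviour of regBankUnion for these two banks: SGPR only if every
// input is SGPR, otherwise VGPR.
static Bank regBankUnion(Bank A, Bank B) {
  return (A == SGPR && B == SGPR) ? SGPR : VGPR;
}

int main() {
  std::printf("%d %d %d\n",
              regBankUnion(SGPR, SGPR),  // SGPR (0)
              regBankUnion(SGPR, VGPR),  // VGPR (1)
              regBankUnion(VGPR, SGPR)); // VGPR (1)
  return 0;
}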
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); // The index can be either if the source vector is VGPR. OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); @@ -2332,15 +2516,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); - unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), + MRI, *TRI); + unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); + OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, + InsertSize); // The index can be either if the source vector is VGPR. - OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); break; } case AMDGPU::G_UNMERGE_VALUES: { @@ -2389,6 +2576,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_mbcnt_hi: case Intrinsic::amdgcn_ubfe: case Intrinsic::amdgcn_sbfe: + case Intrinsic::amdgcn_mul_u24: + case Intrinsic::amdgcn_mul_i24: case Intrinsic::amdgcn_lerp: case Intrinsic::amdgcn_sad_u8: case Intrinsic::amdgcn_msad_u8: @@ -2609,7 +2798,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // This must be an SGPR, but accept a VGPR. unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, AMDGPU::SGPRRegBankID); - OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); break; } @@ -2673,6 +2861,26 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case Intrinsic::amdgcn_ds_gws_init: + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_br: { + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + + // This must be an SGPR, but accept a VGPR. 
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); + break; + } + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { + // This must be an SGPR, but accept a VGPR. + unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); + break; + } default: if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID)) { @@ -2760,4 +2968,3 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getOperandsMapping(OpdsMapping), MI.getNumOperands()); } - diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 584b23c0c2204e..a14b74961118ae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -13,6 +13,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -42,6 +44,18 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const SIRegisterInfo *TRI; const SIInstrInfo *TII; + bool collectWaterfallOperands( + SmallSet &SGPROperandRegs, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef OpIndices) const; + + bool executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range Range, + SmallSet &SGPROperandRegs, + MachineRegisterInfo &MRI) const; + bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 7cffdf1a4dcf9e..9806e6b0714f67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -26,19 +26,59 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. //===----------------------------------------------------------------------===// -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { - static const unsigned SubRegs[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, - AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, - AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, - AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24, - AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29, - AMDGPU::sub30, AMDGPU::sub31 - }; - - assert(Channel < array_lengthof(SubRegs)); - return SubRegs[Channel]; +// Table of NumRegs sized pieces at every 32-bit offset. 
+static const uint16_t SubRegFromChannelTable[][32] = { + { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19, + AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, + AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, + AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31 + }, + { + AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4, + AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, + AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12, + AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16, + AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, + AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24, + AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28, + AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister + }, + { + AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5, + AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9, + AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13, + AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17, + AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21, + AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25, + AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29, + AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister + }, + { + AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6, + AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10, + AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14, + AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18, + AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22, + AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26, + AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30, + AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister + } +}; + +// FIXME: TableGen should generate something to make this manageable for all +// register classes. At a minimum we could use the opposite of +// composeSubRegIndices and go up from the base 32-bit subreg. 
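A quick reading of the table above, using the accessor defined just below (usage sketch; values come straight from the rows shown):

    // NumRegs selects the row (NumRegs - 1), Channel selects the column.
    unsigned Sub64 = AMDGPURegisterInfo::getSubRegFromChannel(/*Channel=*/2, /*NumRegs=*/2);
    // Sub64 == AMDGPU::sub2_sub3
    unsigned Sub32 = AMDGPURegisterInfo::getSubRegFromChannel(/*Channel=*/5);
    // Sub32 == AMDGPU::sub5 (NumRegs defaults to 1)

Entries past the end of a row are AMDGPU::NoSubRegister, and the asserts in the accessor reject NumRegs values that have no row.
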
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) { + const unsigned NumRegIndex = NumRegs - 1; + + assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) && + "Not implemented"); + assert(Channel < array_lengthof(SubRegFromChannelTable[0])); + return SubRegFromChannelTable[NumRegIndex][Channel]; } void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 3453a8c1b0b396..9e713ca804a112 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -28,7 +28,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) - static unsigned getSubRegFromChannel(unsigned Channel); + static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1); void reserveRegisterTuples(BitVector &, unsigned Reg) const; }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c32252f510d6d7..95f817b006eaac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -898,9 +898,6 @@ bool GCNPassConfig::addInstSelector() { addPass(createSILowerI1CopiesPass()); addPass(createSIFixupVectorISelPass()); addPass(createSIAddIMGInitPass()); - // FIXME: Remove this once the phi on CF_END is cleaned up by either removing - // LCSSA or other ways. - addPass(&UnreachableMachineBlockElimID); return false; } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b946c308cf1274..94d1d350dfd2d6 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -143,6 +143,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyDLC, ImmTyGLC, ImmTySLC, + ImmTySWZ, ImmTyTFE, ImmTyD16, ImmTyClampSI, @@ -328,6 +329,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isDLC() const { return isImmTy(ImmTyDLC); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } + bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); } @@ -820,6 +822,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyDLC: OS << "DLC"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; + case ImmTySWZ: OS << "SWZ"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -6037,6 +6040,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 44c77a2faa3d94..12609054e010a5 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -9,13 +9,13 
@@ //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; def MUBUFAddr64Atomic : ComplexPattern; def MUBUFScratchOffen : ComplexPattern; def MUBUFScratchOffset : ComplexPattern; -def MUBUFOffset : ComplexPattern; +def MUBUFOffset : ComplexPattern; def MUBUFOffsetNoGLC : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; @@ -56,6 +56,17 @@ class MTBUFAddr64Table { // MTBUF classes //===----------------------------------------------------------------------===// +class MTBUFGetBaseOpcode { + string ret = !subst("FORMAT_XY", "FORMAT_X", + !subst("FORMAT_XYZ", "FORMAT_X", + !subst("FORMAT_XYZW", "FORMAT_X", Op))); +} + +class getMTBUFElements { + int ret = 1; +} + + class MTBUF_Pseudo pattern=[]> : InstSI, @@ -69,6 +80,9 @@ class MTBUF_Pseudo (NAME); + Instruction BaseOpcode = !cast(MTBUFGetBaseOpcode.ret); + let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; @@ -92,6 +106,7 @@ class MTBUF_Pseudo has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; + bits<4> elements = 0; } class MTBUF_Real : @@ -128,17 +143,17 @@ class getMTBUFInsDA vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc), + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc) + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -183,51 +198,54 @@ class MTBUF_SetupAddr { class MTBUF_Load_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo.ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 1; let mayStore = 0; + let elements = elems; } multiclass MTBUF_Pseudo_Loads { - def _OFFSET : MTBUF_Load_Pseudo , + i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Load_Pseudo , + i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Load_Pseudo ; - def _IDXEN : MTBUF_Load_Pseudo ; - def _BOTHEN : MTBUF_Load_Pseudo ; + def _OFFEN : MTBUF_Load_Pseudo ; + def _IDXEN : MTBUF_Load_Pseudo ; + def _BOTHEN : MTBUF_Load_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Load_Pseudo ; - def _OFFEN_exact : MTBUF_Load_Pseudo ; - def _IDXEN_exact : MTBUF_Load_Pseudo ; - def _BOTHEN_exact : MTBUF_Load_Pseudo ; + def _OFFSET_exact : MTBUF_Load_Pseudo ; + def _OFFEN_exact : MTBUF_Load_Pseudo ; + def _IDXEN_exact : MTBUF_Load_Pseudo ; + def _BOTHEN_exact : MTBUF_Load_Pseudo ; } } class MTBUF_Store_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, @@ 
-235,39 +253,40 @@ class MTBUF_Store_Pseudo .ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 0; let mayStore = 1; + let elements = elems; } multiclass MTBUF_Pseudo_Stores { - def _OFFSET : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Store_Pseudo ; - def _IDXEN : MTBUF_Store_Pseudo ; - def _BOTHEN : MTBUF_Store_Pseudo ; + def _OFFEN : MTBUF_Store_Pseudo ; + def _IDXEN : MTBUF_Store_Pseudo ; + def _BOTHEN : MTBUF_Store_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Store_Pseudo ; - def _OFFEN_exact : MTBUF_Store_Pseudo ; - def _IDXEN_exact : MTBUF_Store_Pseudo ; - def _BOTHEN_exact : MTBUF_Store_Pseudo ; + def _OFFSET_exact : MTBUF_Store_Pseudo ; + def _OFFEN_exact : MTBUF_Store_Pseudo ; + def _IDXEN_exact : MTBUF_Store_Pseudo ; + def _BOTHEN_exact : MTBUF_Store_Pseudo ; } } @@ -395,7 +414,7 @@ class getMUBUFInsDA vdataList, ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc)) + !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz)) ); } @@ -467,7 +486,7 @@ class MUBUF_Load_Pseudo .ret, !if(HasTiedDest, (ins getVregSrcForVT.ret:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe") # "$dlc", + !if(isLds, " lds", "$tfe") # "$dlc" # "$swz", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -485,15 +504,15 @@ class MUBUF_Load_Pseudo : Pat < - (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; class MUBUF_Addr64_Load_Pat : Pat < - (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; multiclass MUBUF_Pseudo_Load_Pats { @@ -544,7 +563,7 @@ class MUBUF_Store_Pseudo .ret]>.ret, - " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; @@ -560,12 +579,12 @@ multiclass MUBUF_Pseudo_Stores, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo , + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo ; @@ -583,8 +602,8 @@ multiclass MUBUF_Pseudo_Stores : MUBUF_Pseudo { + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz), + " $srsrc, $soffset$offset lds$glc$slc$swz"> { let mayLoad = 0; let mayStore = 1; let 
maybeAtomic = 1; @@ -1067,35 +1086,35 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < // MTBUF Instructions //===----------------------------------------------------------------------===// -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; - defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>; - defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>; + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>; } // End HasUnpackedD16VMem. 
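The swz bit threaded through the MUBUF/MTBUF operand lists in this file rides in the same immediate as the cache bits; the patterns further below rename that operand from $cachepolicy to $auxiliary and pull the new bit out with the extract_swz transform ((value >> 3) & 1). A small decoding sketch under that layout (dlc/swz positions match the extract_dlc/extract_swz transforms shown below; glc/slc positions are assumed from the pre-existing extract_glc/extract_slc helpers, which this hunk does not show):

    // Assumed layout of the buffer intrinsic auxiliary immediate.
    struct BufferAux {
      static bool glc(unsigned Aux) { return Aux & 1; }        // bit 0 (assumed)
      static bool slc(unsigned Aux) { return (Aux >> 1) & 1; } // bit 1 (assumed)
      static bool dlc(unsigned Aux) { return (Aux >> 2) & 1; } // bit 2
      static bool swz(unsigned Aux) { return (Aux >> 3) & 1; } // bit 3, new in this patch
    };
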
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>; } // End HasPackedD16VMem. let SubtargetPredicate = isGFX7Plus in { @@ -1130,6 +1149,10 @@ def extract_dlc : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); }]>; +def extract_swz : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1138,32 +1161,36 @@ multiclass MUBUF_LoadIntrinsicPat { def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0)), + timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0)), + timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm)), + timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm)), + 
timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1215,35 +1242,39 @@ multiclass MUBUF_StoreIntrinsicPat { def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1447,8 +1478,8 @@ def : GCNPat< class MUBUFLoad_PatternADDR64 : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) >; multiclass MUBUFLoad_Atomic_Pattern ; def : GCNPat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } @@ -1482,8 +1513,8 @@ multiclass MUBUFLoad_Pattern ; } @@ -1506,12 +1537,12 @@ multiclass MUBUFScratchLoadPat ; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1521,12 +1552,12 @@ multiclass MUBUFScratchLoadPat_D16 { def : GCNPat < (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 
0, 0, 0, $in) >; def : GCNPat < (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) >; } @@ -1566,16 +1597,16 @@ defm : MUBUFScratchLoadPat_D16 { - // Store follows atomic op convention so address is forst + // Store follows atomic op convention so address is first def : GCNPat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) >; def : GCNPat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } let SubtargetPredicate = isGFX6GFX7 in { @@ -1589,8 +1620,8 @@ multiclass MUBUFStore_Pattern ; } @@ -1604,13 +1635,13 @@ multiclass MUBUFScratchStorePat ; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1649,36 +1680,40 @@ multiclass MTBUF_LoadIntrinsicPat { def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0)), + timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm)), + timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0)), + timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm)), + timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1707,36 +1742,40 @@ multiclass MTBUF_StoreIntrinsicPat { def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0), + timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm 
$format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm), + timm:$format, timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0), + timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, - timm:$offset, timm:$format, timm:$cachepolicy, timm), + timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -2403,3 +2442,22 @@ def getMUBUFInfoFromBaseOpcodeAndElements : SearchIndex { let Table = MUBUFInfoTable; let Key = ["BaseOpcode", "elements"]; } + +def MTBUFInfoTable : GenericTable { + let FilterClass = "MTBUF_Pseudo"; + let CppTypeName = "MTBUFInfo"; + let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getMTBUFOpcodeHelper"; +} + +def getMTBUFInfoFromOpcode : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["Opcode"]; +} + +def getMTBUFInfoFromBaseOpcodeAndElements : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["BaseOpcode", "elements"]; +} diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 4ec4be9bc48589..ec2e2c4e8b7174 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1095,6 +1095,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { case 106: return createRegOperand(VCC); case 108: return createRegOperand(TBA); case 110: return createRegOperand(TMA); + case 125: return createRegOperand(SGPR_NULL); case 126: return createRegOperand(EXEC); case 235: return createRegOperand(SRC_SHARED_BASE); case 236: return createRegOperand(SRC_SHARED_LIMIT); @@ -1172,7 +1173,8 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { int TTmpIdx = getTTmpIdx(Val); if (TTmpIdx >= 0) { - return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx); + auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32); + return createSRegOperand(TTmpClsId, TTmpIdx); } else if (Val > SGPR_MAX) { return IsWave64 ? 
decodeSpecialReg64(Val) : decodeSpecialReg32(Val); diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index b76552d1b674d2..bda84a6d2c53dc 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1184,7 +1184,7 @@ class FLAT_Real_gfx10 op, FLAT_Pseudo ps> : let AssemblerPredicate = isGFX10Plus; let DecoderNamespace = "GFX10"; - let Inst{11-0} = {offset{12}, offset{10-0}}; + let Inst{11-0} = offset{11-0}; let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue); let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d); let Inst{55} = 0; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 973491a70d3c1c..71c802b4a58ab3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -3,6 +3,8 @@ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// Notified per clause 4(b) of the license. // //===----------------------------------------------------------------------===// // @@ -230,33 +232,11 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { // Pick best from BotCand and TopCand. LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); - SchedCandidate Cand; - if (TopCand.Reason == BotCand.Reason) { - Cand = BotCand; - GenericSchedulerBase::CandReason TopReason = TopCand.Reason; - TopCand.Reason = NoCand; - GenericScheduler::tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { - Cand.setBest(TopCand); - } else { - TopCand.Reason = TopReason; - } - } else { - if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) { - Cand = TopCand; - } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) { - Cand = BotCand; - } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) { - Cand = TopCand; - } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) { - Cand = BotCand; - } else { - if (BotCand.Reason > TopCand.Reason) { - Cand = TopCand; - } else { - Cand = BotCand; - } - } + SchedCandidate Cand = BotCand; + TopCand.Reason = NoCand; + GenericScheduler::tryCandidate(Cand, TopCand, nullptr); + if (TopCand.Reason != NoCand) { + Cand.setBest(TopCand); } LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index a451625439759a..d2ea94548dfee0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -196,6 +196,10 @@ void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "slc"); } +void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { +} + void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "tfe"); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 
0f62f039763ef5..66b70831ff9efd 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -72,6 +72,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 383b3c80ebaaf4..10120c3d6f81dd 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -124,9 +124,8 @@ class SIFixSGPRCopies : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - // FIXME: Temporarily disable these flags as they do not currently hold - //AU.addPreserved(); - //AU.setPreservesCFG(); + AU.addPreserved(); + AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } }; diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 45c06ebb547aca..ed07ed100a1924 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -112,6 +112,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -132,6 +133,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -157,6 +159,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -177,6 +180,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -669,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: return true; + case TargetStackID::SVEVector: + return false; } llvm_unreachable("Invalid TargetStackID::Value"); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6a89a2cecb3509..2c7407993d7f02 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3840,8 +3840,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( if (NeedClampOperand) I.addImm(0); // clamp bit for e64 encoding - SmallSetVector Worklist; - TII->legalizeOperands(*I, Worklist); + TII->legalizeOperands(*I); MI.eraseFromParent(); return BB; @@ -6282,7 +6281,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(4), // soffset Offsets.second, // offset - Op.getOperand(5), // cachepolicy + Op.getOperand(5), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; @@ -6300,7 +6299,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy + 
Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; @@ -6349,7 +6348,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // soffset Offsets.second, // offset Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy + Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; @@ -6373,7 +6372,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; @@ -6843,7 +6842,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy + Op.getOperand(8), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : @@ -6868,7 +6867,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : @@ -6950,7 +6949,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy + Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = @@ -6994,7 +6993,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(6), // soffset Offsets.second, // offset - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp index 8f7e9292c9e821..64ab00f81a07c2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaterfall.cpp @@ -391,6 +391,9 @@ bool SIInsertWaterfall::removeRedundantWaterfall(WaterfallWorkitem &Item) { MRI->replaceRegWith(RFLDstReg, ReplaceReg); Removed++; ToRemoveRFLList.push_back(RFLMI); + } else if (RFLDstOp->isDead()) { + Removed++; + ToRemoveRFLList.push_back(RFLMI); } else { NewRFLList.push_back(RFLMI); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2dc77f59581795..10cccf79e67519 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2461,8 +2461,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // It might happen that UseMI was commuted // and we now have SGPR as SRC1. If so 2 inlined // constant and SGPR are illegal. 
- SmallSetVector Worklist; - legalizeOperands(UseMI, Worklist); + legalizeOperands(UseMI); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -3139,7 +3138,8 @@ static bool shouldReadExec(const MachineInstr &MI) { return true; } - if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || + if (MI.isPreISelOpcode() || + SIInstrInfo::isGenericOpcode(MI.getOpcode()) || SIInstrInfo::isSALU(MI) || SIInstrInfo::isSMRD(MI)) return false; @@ -3956,12 +3956,9 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, - MachineInstr &MI, - SetVectorType &Worklist) const { + MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MCInstrDesc &InstrDesc = get(Opc); - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); MachineOperand &Src0 = MI.getOperand(Src0Idx); @@ -4011,125 +4008,15 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for - // lane select. - // Previous implementations assumed that a non SGPR operand meant - // that the value was uniform across all lanes - modified this behaviour to - // use a waterfall operation to process all indices. This will be a worst case - // of 64 iterations, but will only be a single iteration for a uniform across - // all lanes so the extra cost is low + // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane + // select is uniform. if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { - // Waterfall to read all the values across all lanes - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - - MachineBasicBlock::iterator I(&MI); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); const DebugLoc &DL = MI.getDebugLoc(); - - unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // Initialize the register we accumulate the result into - BuildMI(MBB, I, DL, get(AMDGPU::V_MOV_B32_e32), InitReg) - .addImm(0x0); - - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - unsigned NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - BuildMI(MBB, I, DL, get(TargetOpcode::IMPLICIT_DEF), TmpExec); - - // Save the EXEC mask - BuildMI(MBB, I, DL, get(AMDGPU::S_MOV_B64), SaveExec) - .addReg(AMDGPU::EXEC); - - MachineBasicBlock &LoopBB = *MF.CreateMachineBasicBlock(); - MachineBasicBlock &RemainderBB = *MF.CreateMachineBasicBlock(); - MachineFunction::iterator MBBI(MBB); - ++MBBI; - - MF.insert(MBBI, &LoopBB); - MF.insert(MBBI, &RemainderBB); - - LoopBB.addSuccessor(&LoopBB); - LoopBB.addSuccessor(&RemainderBB); - - RemainderBB.transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB.splice(RemainderBB.begin(), &MBB, I, MBB.end()); - - MBB.addSuccessor(&LoopBB); - - MachineBasicBlock::iterator J = LoopBB.begin(); - - unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned CondReg = 
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned CurrentValue = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - - BuildMI(LoopBB, J, DL, get(TargetOpcode::PHI), PhiReg) - .addReg(InitReg) - .addMBB(&MBB) - .addReg(NewDst) - .addMBB(&LoopBB); - - BuildMI(LoopBB, J, DL, get(TargetOpcode::PHI), PhiExec) - .addReg(TmpExec) - .addMBB(&MBB) - .addReg(NewExec) - .addMBB(&LoopBB); - - // Read the next variant <- also loop target. - BuildMI(LoopBB, J, DL, get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) - .addReg(Src1.getReg(), getUndefRegState(Src1.isUndef())); - - // Compare the just read value to all possible Idx values. - BuildMI(LoopBB, J, DL, get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) - .addReg(CurrentIdxReg) - .addReg(Src1.getReg(), 0, Src1.getSubReg()); - - // Update EXEC, save the original EXEC value to VCC. - BuildMI(LoopBB, J, DL, get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) - .addReg(CondReg, RegState::Kill); - - // TODO: Conditional branch here to loop header as a potential optimization? - - // Use readlane to get the value for all lanes with the current index - BuildMI(LoopBB, J, DL, get(AMDGPU::V_READLANE_B32), CurrentValue) - .addReg(Src0.getReg()) - .addReg(CurrentIdxReg); - - // Mov the just read value into the destination using or - // TODO: In theory a mov would do here - but this is tricky to get to work - // correctly as it seems to confuse the register allocator and other passes - BuildMI(LoopBB, J, DL, get(AMDGPU::V_OR_B32_e64), NewDst) - .addReg(PhiReg) - .addReg(CurrentValue); - - MRI.setSimpleHint(NewExec, CondReg); - - // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(LoopBB, J, DL, get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(NewExec); - - // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use - // s_cbranch_scc0? - - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. - BuildMI(LoopBB, J, DL, get(AMDGPU::S_CBRANCH_EXECNZ)) - .addMBB(&LoopBB); - - MachineBasicBlock::iterator First = RemainderBB.begin(); - BuildMI(RemainderBB, First, DL, get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(SaveExec); - - MRI.replaceRegWith(DstReg, NewDst); - addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); - - MI.eraseFromParent(); - + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); return; } @@ -4568,14 +4455,13 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { } void SIInstrInfo::legalizeOperands(MachineInstr &MI, - SetVectorType &Worklist, MachineDominatorTree *MDT) const { MachineFunction &MF = *MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); // Legalize VOP2 if (isVOP2(MI) || isVOPC(MI)) { - legalizeOperandsVOP2(MRI, MI, Worklist); + legalizeOperandsVOP2(MRI, MI); return; } @@ -4829,6 +4715,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, MIB.addImm(TFE->getImm()); } + MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); + MIB.cloneMemRefs(MI); Addr64 = MIB; } else { @@ -5046,7 +4934,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. 
- legalizeOperands(Inst, Worklist, MDT); + legalizeOperands(Inst, MDT); continue; } @@ -5138,7 +5026,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, } // Legalize the operands - legalizeOperands(Inst, Worklist, MDT); + legalizeOperands(Inst, MDT); if (HasDst) addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); @@ -5172,7 +5060,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit Inst.addImplicitDefUseOperands(*MBB.getParent()); MRI.replaceRegWith(OldDstReg, ResultReg); - legalizeOperands(Inst, Worklist, MDT); + legalizeOperands(Inst, MDT); addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); return true; @@ -5450,8 +5338,8 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, // Try to legalize the operands in case we need to swap the order to keep it // valid. - legalizeOperands(*LoHalf, Worklist, MDT); - legalizeOperands(*HiHalf, Worklist, MDT); + legalizeOperands(*LoHalf, MDT); + legalizeOperands(*HiHalf, MDT); // Move all users of this moved vlaue. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); @@ -5799,7 +5687,16 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( if (RI.hasAGPRs(NewDstRC)) return nullptr; - NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + switch (Inst.getOpcode()) { + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + break; + default: + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + } + if (!NewDstRC) return nullptr; } else { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index a1306f1fe65dbe..a1a3962c413009 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -3,8 +3,6 @@ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. -// Notified per clause 4(b) of the license. // //===----------------------------------------------------------------------===// // @@ -853,9 +851,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. - void legalizeOperandsVOP2(MachineRegisterInfo &MRI, - MachineInstr &MI, - SetVectorType &Worklist) const; + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; /// Fix operands in \p MI to satisfy constant bus requirements. void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const; @@ -879,7 +875,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// instructions and control-flow around \p MI. If present, \p MDT is /// updated. 
void legalizeOperands(MachineInstr &MI, - SetVectorType &Worklist, MachineDominatorTree *MDT = nullptr) const; /// Replace this instruction's opcode with the equivalent VALU diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index e1b32c4964c452..7473a0c64b2f94 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -84,7 +84,7 @@ def SDTtbuffer_load : SDTypeProfile<1, 8, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -102,7 +102,7 @@ def SDTtbuffer_store : SDTypeProfile<0, 9, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -119,7 +119,7 @@ def SDTBufferLoad : SDTypeProfile<1, 7, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, @@ -145,7 +145,7 @@ def SDTBufferStore : SDTypeProfile<0, 8, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, @@ -1035,6 +1035,7 @@ def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index a695e2bdb713b6..6cd7e9462d6ea9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2118,3 +2118,13 @@ def : FP16Med3Pat; defm : Int16Med3Pat; defm : Int16Med3Pat; } // End Predicates = [isGFX9Plus] + +class AMDGPUGenericInstruction : GenericInstruction { + let Namespace = "AMDGPU"; +} + +def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = 0; +} diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b9d6839d2b00b7..233316f9b3d462 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -158,6 +158,31 @@ class SILoadStoreOptimizer : public MachineFunctionPass { return true; } + bool hasMergeableAddress(const MachineRegisterInfo &MRI) { + for (unsigned i = 0; i < NumAddresses; ++i) { + const MachineOperand *AddrOp = AddrReg[i]; + // Immediates are always OK. + if (AddrOp->isImm()) + continue; + + // Don't try to merge addresses that aren't either immediates or registers. 
+ // TODO: Should be possible to merge FrameIndexes and maybe some other + // non-register + if (!AddrOp->isReg()) + return false; + + // TODO: We should be able to merge physical reg addreses. + if (Register::isPhysicalRegister(AddrOp->getReg())) + return false; + + // If an address has only one use then there will be on other + // instructions with the same address, so we can't merge this one. + if (MRI.hasOneNonDBGUse(AddrOp->getReg())) + return false; + } + return true; + } + void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, const GCNSubtarget &STM); void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII); @@ -186,7 +211,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { AliasAnalysis *AA = nullptr; bool OptimizeAgain; - bool dmasksCanBeCombined(const CombineInfo &CI); + static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII); static bool offsetsCanBeCombined(CombineInfo &CI); static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); static unsigned getNewOpcode(const CombineInfo &CI); @@ -202,7 +227,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { unsigned write2Opcode(unsigned EltSize) const; unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); - MachineBasicBlock::iterator mergeImageSamplePair(CombineInfo &CI); + MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); @@ -219,6 +244,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited, SmallPtrSet &Promoted) const; + void addInstToMergeableList(const CombineInfo &CI, + std::list > &MergeableInsts) const; + bool collectMergeableInsts(MachineBasicBlock &MBB, + std::list > &MergeableInsts) const; public: static char ID; @@ -227,7 +256,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - bool optimizeBlock(MachineBasicBlock &MBB); + void removeCombinedInst(std::list &MergeList, + const MachineInstr &MI); + bool optimizeInstsWithSameBaseAddr(std::list &MergeList, + bool &OptimizeListAgain); + bool optimizeBlock(std::list > &MergeableInsts); bool runOnMachineFunction(MachineFunction &MF) override; @@ -248,7 +281,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { // FIXME: Handle d16 correctly return AMDGPU::getMUBUFElements(Opc); } - if (TII.isMIMG(MI)) { uint64_t DMaskImm = TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); @@ -286,7 +318,8 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: return BUFFER_STORE; } - } else if (TII.isMIMG(Opc)) { + } + if (TII.isMIMG(Opc)) { // Ignore instructions encoded without vaddr. 
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) return UNKNOWN; @@ -468,6 +501,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); AddrReg[i] = &I->getOperand(AddrIdx[i]); } + + InstsToMove.clear(); } void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, @@ -601,12 +636,12 @@ static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, return MMO; } -bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI) { +bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) { assert(CI.InstClass == MIMG); // Ignore instructions with tfe/lwe set. - const auto *TFEOp = TII->getNamedOperand(*CI.I, AMDGPU::OpName::tfe); - const auto *LWEOp = TII->getNamedOperand(*CI.I, AMDGPU::OpName::lwe); + const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); + const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) return false; @@ -728,14 +763,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } const unsigned InstSubclass = getInstSubclass(Opc, *TII); - for (unsigned i = 0; i < CI.NumAddresses; i++) { - // We only ever merge operations with the same base address register, so - // don't bother scanning forward if there are no other uses. - if (CI.AddrReg[i]->isReg() && - (Register::isPhysicalRegister(CI.AddrReg[i]->getReg()) || - MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg()))) - return false; - } + // Do not merge VMEM buffer instructions with "swizzled" bit set. + int Swizzled = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); + if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) + return false; ++MBBI; @@ -747,7 +779,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { - // This is not a matching DS instruction, but we can keep looking as + // This is not a matching instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. // 2. It is safe to move MBBI down past the instruction that I will @@ -797,14 +829,13 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { if (Match) { CI.setPaired(MBBI, *TII); - // Check both offsets (or masks for MIMG) fit in the reduced - // range. + // Check both offsets (or masks for MIMG) can be combined and fit in the + // reduced range. bool canBeCombined = CI.InstClass == MIMG - ? dmasksCanBeCombined(CI) + ? dmasksCanBeCombined(CI, *TII) : widthsFit(*STM, CI) && offsetsCanBeCombined(CI); - // Check both offsets fit in the reduced range. // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. 
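The merge*Pair hunks that follow rewrite two adjacent narrow accesses into a single wider access plus subregister copies. A byte-level sketch of the same idea, illustrative only and assuming a little-endian host and two truly adjacent 32-bit offsets:

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <utility>

// Two adjacent 32-bit loads from the same base become one 64-bit load whose
// halves are handed back in program order, mirroring what the merged machine
// instruction plus the sub0/sub1 subregister copies achieve.
std::pair<uint32_t, uint32_t> mergeAdjacentLoads(const uint8_t *Base,
                                                 uint64_t Off0, uint64_t Off1) {
  uint64_t Lo = std::min(Off0, Off1); // merged access starts at the lower offset
  uint64_t Wide;
  std::memcpy(&Wide, Base + Lo, sizeof(Wide)); // single 8-byte access
  uint32_t First = uint32_t(Wide), Second = uint32_t(Wide >> 32);
  return Off0 == Lo ? std::make_pair(First, Second)
                    : std::make_pair(Second, First);
}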
@@ -914,12 +945,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Next; + return Read2; } unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { @@ -998,23 +1028,22 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Next; + return Write2; } MachineBasicBlock::iterator -SILoadStoreOptimizer::mergeImageSamplePair(CombineInfo &CI) { +SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); const unsigned Opcode = getNewOpcode(CI); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - unsigned DestReg = MRI->createVirtualRegister(SuperRC); + Register DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedDMask = CI.DMask0 | CI.DMask1; unsigned DMaskIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); @@ -1023,7 +1052,16 @@ SILoadStoreOptimizer::mergeImageSamplePair(CombineInfo &CI) { for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { (I == DMaskIdx) ? MIB.addImm(MergedDMask) : MIB.add((*CI.I).getOperand(I)); } - MIB.cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + + MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1043,10 +1081,9 @@ SILoadStoreOptimizer::mergeImageSamplePair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineBasicBlock::iterator @@ -1068,12 +1105,13 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.DLC0) // dlc - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1093,10 +1131,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); 
CI.Paired->eraseFromParent(); - return Next; + return New; } MachineBasicBlock::iterator @@ -1127,14 +1164,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1154,10 +1193,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { @@ -1293,21 +1331,22 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(std::min(CI.Offset0, CI.Offset1)) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineOperand @@ -1620,32 +1659,105 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( return false; } -// Scan through looking for adjacent LDS operations with constant offsets from -// the same base register. We rely on the scheduler to do the hard work of -// clustering nearby loads, and assume these are all adjacent. -bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; +void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, + std::list > &MergeableInsts) const { + for (std::list &AddrList : MergeableInsts) { + if (AddrList.front().hasSameBaseAddress(*CI.I) && + AddrList.front().InstClass == CI.InstClass) { + AddrList.emplace_back(CI); + return; + } + } + + // Base address not found, so add a new list. 
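The grouping done by addInstToMergeableList above follows a common pattern: append a candidate to the existing list whose head shares its key, otherwise open a new single-element list. A generic standalone sketch of that pattern (simplified types, not the pass itself):

#include <list>

// Candidates that share a key end up in the same inner list; every new key
// opens its own list, just as MergeableInsts collects one list per base
// address.
template <typename T, typename KeyEq>
void addToGroups(std::list<std::list<T>> &Groups, const T &Item,
                 KeyEq SameKey) {
  for (std::list<T> &Group : Groups)
    if (SameKey(Group.front(), Item)) {
      Group.push_back(Item);
      return;
    }
  Groups.emplace_back(1, Item); // key not seen yet: new single-element list
}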
+ MergeableInsts.emplace_back(1, CI); +} +bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, + std::list > &MergeableInsts) const { + bool Modified = false; // Contain the list MemInfoMap Visited; // Contains the list of instructions for which constant offsets are being // promoted to the IMM. SmallPtrSet AnchorList; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - MachineInstr &MI = *I; - + // Sort potential mergeable instructions into lists. One list per base address. + for (MachineInstr &MI : MBB.instrs()) { + // We run this before checking if an address is mergeable, because it can produce + // better code even if the instructions aren't mergeable. if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) Modified = true; + const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII); + if (InstClass == UNKNOWN) + continue; + // Don't combine if volatile. - if (MI.hasOrderedMemoryRef()) { - ++I; + if (MI.hasOrderedMemoryRef()) continue; - } CombineInfo CI; - CI.setMI(I, *TII, *STM); + CI.setMI(MI, *TII, *STM); + + if (!CI.hasMergeableAddress(*MRI)) + continue; + + addInstToMergeableList(CI, MergeableInsts); + } + return Modified; +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock( + std::list > &MergeableInsts) { + bool Modified = false; + + for (std::list &MergeList : MergeableInsts) { + if (MergeList.size() < 2) + continue; + + bool OptimizeListAgain = false; + if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { + // We weren't able to make any changes, so clear the list so we don't + // process the same instructions the next time we try to optimize this + // block. 
+ MergeList.clear(); + continue; + } + + // We made changes, but also determined that there were no more optimization + // opportunities, so we don't need to reprocess the list + if (!OptimizeListAgain) + MergeList.clear(); + + OptimizeAgain |= OptimizeListAgain; + Modified = true; + } + return Modified; +} + +void +SILoadStoreOptimizer::removeCombinedInst(std::list &MergeList, + const MachineInstr &MI) { + + for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { + if (&*CI->I == &MI) { + MergeList.erase(CI); + return; + } + } +} + +bool +SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( + std::list &MergeList, + bool &OptimizeListAgain) { + bool Modified = false; + for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { + CombineInfo &CI = *I; switch (CI.InstClass) { default: @@ -1653,57 +1765,60 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { case DS_READ: if (findMatchingInst(CI)) { Modified = true; - I = mergeRead2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case DS_WRITE: if (findMatchingInst(CI)) { Modified = true; - I = mergeWrite2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case S_BUFFER_LOAD_IMM: if (findMatchingInst(CI)) { Modified = true; - I = mergeSBufferLoadImmPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16; } - continue; + break; case BUFFER_LOAD: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferLoadPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; case BUFFER_STORE: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferStorePair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; case MIMG: if (findMatchingInst(CI)) { Modified = true; - I = mergeImageSamplePair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeImagePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; } - - ++I; + // Clear the InstsToMove after we have finished searching so we don't have + // stale values left over if we search for this CI again in another pass + // over the block. + CI.InstsToMove.clear(); } return Modified; @@ -1729,10 +1844,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; + for (MachineBasicBlock &MBB : MF) { + std::list > MergeableInsts; + // First pass: Collect list of all instructions we know how to merge. 
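A simplified standalone sketch of the collect-then-iterate driver used here (the Step callback is a stand-in for optimizeInstsWithSameBaseAddr; this is not the actual pass):

#include <functional>
#include <list>

struct Candidate {}; // stand-in for CombineInfo

// Step returns true if it changed anything and sets TryAgain if the list may
// still contain further merge opportunities (e.g. wider merges after a first
// round).
bool optimizeGroups(std::list<std::list<Candidate>> &Groups,
                    const std::function<bool(std::list<Candidate> &, bool &)> &Step) {
  bool Modified = false;
  bool RunAgain = true;
  while (RunAgain) {       // mirrors the do { ... } while (OptimizeAgain) loop
    RunAgain = false;
    for (std::list<Candidate> &L : Groups) {
      if (L.size() < 2)
        continue;          // nothing left to pair up in this list
      bool TryAgain = false;
      if (!Step(L, TryAgain)) {
        L.clear();         // no progress: stop revisiting this list
        continue;
      }
      Modified = true;
      if (!TryAgain)
        L.clear();         // changed, but nothing further to do here
      RunAgain |= TryAgain;
    }
  }
  return Modified;
}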
+ Modified |= collectMergeableInsts(MBB, MergeableInsts); do { OptimizeAgain = false; - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MergeableInsts); } while (OptimizeAgain); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index cc5232a7d46ac4..f0cda3f8d18c5e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -619,6 +619,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -739,6 +740,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index bb4169788f4665..afb2fd987afdca 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -137,10 +137,51 @@ struct MUBUFInfo { bool has_soffset; }; +struct MTBUFInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t elements; + bool has_vaddr; + bool has_srsrc; + bool has_soffset; +}; + +#define GET_MTBUFInfoTable_DECL +#define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" +int getMTBUFBaseOpcode(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc); + return Info ? Info->BaseOpcode : -1; +} + +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) { + const MTBUFInfo *Info = getMTBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements); + return Info ? Info->Opcode : -1; +} + +int getMTBUFElements(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->elements : 0; +} + +bool getMTBUFHasVAddr(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_vaddr : false; +} + +bool getMTBUFHasSrsrc(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_srsrc : false; +} + +bool getMTBUFHasSoffset(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_soffset : false; +} + int getMUBUFBaseOpcode(unsigned Opc) { const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc); return Info ? 
Info->BaseOpcode : -1; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a578fd2bb6a9a2..f78dadd447ff5d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -263,6 +263,24 @@ struct MIMGInfo { LLVM_READONLY const MIMGInfo *getMIMGInfo(unsigned Opc); +LLVM_READONLY +int getMTBUFBaseOpcode(unsigned Opc); + +LLVM_READONLY +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements); + +LLVM_READONLY +int getMTBUFElements(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasVAddr(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasSrsrc(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasSoffset(unsigned Opc); + LLVM_READONLY int getMUBUFBaseOpcode(unsigned Opc); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index bea0c7bd080d2a..e1e35c9ba13af6 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -235,7 +235,7 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>; -defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index e5f215fbcd7196..15604aa13533b7 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -752,19 +752,22 @@ multiclass Bits_OpsRev_i16_Pats ; def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)) + !if(!eq(PreservesHI16,1), (ClearHI16 (inst VSrc_b32:$src1, VSrc_b32:$src0)), + (inst VSrc_b32:$src1, VSrc_b32:$src0)) >; def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, - !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)), + !if(!eq(PreservesHI16,1), (ClearHI16 (inst VSrc_b32:$src1, VSrc_b32:$src0)), + (inst VSrc_b32:$src1, VSrc_b32:$src0)), sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 46382816b98399..1eaf871867e055 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -191,7 +191,7 @@ getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, ARM::PC); markSuperRegs(Reserved, ARM::FPSCR); markSuperRegs(Reserved, ARM::APSR_NZCV); - if (TFI->hasFP(MF)) + if (TFI->hasFP(MF) || STI.isTargetDarwin()) markSuperRegs(Reserved, getFramePointerReg(STI)); if (hasBasePointer(MF)) markSuperRegs(Reserved, BasePtr); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index d02d495f3adc88..45bf6763382281 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -7482,6 +7482,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize >= 32 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + ShuffleVectorInst::isIdentityMask(M) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16)) @@ -13121,7 +13122,8 @@ 
static SDValue PerformLOADCombine(SDNode *N, // Optimize trunc store (of multiple scalars) to shuffle and store. First, // pack all of the elements in one place. Next, store to memory in fewer // chunks. -SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG) { +static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, + SelectionDAG &DAG) { SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); if (!St->isTruncatingStore() || !VT.isVector()) @@ -13205,7 +13207,8 @@ SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG) { // Try taking a single vector store from an truncate (which would otherwise turn // into an expensive buildvector) and splitting it into a series of narrowing // stores. -SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG) { +static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, + SelectionDAG &DAG) { if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) return SDValue(); SDValue Trunc = St->getValue(); @@ -13695,7 +13698,7 @@ static SDValue PerformShiftCombine(SDNode *N, // Look for a sign/zero extend of a larger than legal load. This can be split // into two extending loads, which are simpler to deal with than an arbitrary // sign extend. -SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::LOAD) return SDValue(); diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index dd2d4ec118aa8a..4d2c9dad7099ea 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -11339,7 +11339,7 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) { SmallVector Opcodes; auto parseOne = [&]() -> bool { - const MCExpr *OE; + const MCExpr *OE = nullptr; SMLoc OpcodeLoc = getLexer().getLoc(); if (check(getLexer().is(AsmToken::EndOfStatement) || Parser.parseExpression(OE), diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index c682c46fe31632..5a9a34e4af3c28 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -90,6 +90,13 @@ class BPFAbstractMemberAccess final : public ModulePass { static char ID; BPFAbstractMemberAccess() : ModulePass(ID) {} + struct CallInfo { + uint32_t Kind; + uint32_t AccessIndex; + MDNode *Metadata; + Value *Base; + }; + private: enum : uint32_t { BPFPreserveArrayAI = 1, @@ -99,34 +106,32 @@ class BPFAbstractMemberAccess final : public ModulePass { std::map GEPGlobals; // A map to link preserve_*_access_index instrinsic calls. - std::map> AIChain; + std::map> AIChain; // A map to hold all the base preserve_*_access_index instrinsic calls. // The base call is not an input of any other preserve_*_access_index // intrinsics. 
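The two maps described in the comments above link every preserve_*_access_index call to its parent call together with the parent's decoded CallInfo, with chain heads kept separately. A simplified standalone sketch of walking such a chain back to its base to build an access key, illustrative only and with stand-in types:

#include <cstdint>
#include <map>
#include <string>
#include <utility>

struct Node;        // stand-in for CallInst
struct AccessInfo { // stand-in for the new CallInfo struct
  uint32_t Kind = 0;
  uint32_t AccessIndex = 0;
};

// Chain maps a call to {its parent call, the parent's info}; the base call of
// a chain has no entry.  Walking leaf -> base and prepending indices yields a
// base-first key, similar in spirit to what computeBaseAndAccessKey builds.
std::string buildAccessKey(
    Node *Leaf, AccessInfo LeafInfo,
    const std::map<Node *, std::pair<Node *, AccessInfo>> &Chain) {
  std::string Key;
  Node *Cur = Leaf;
  AccessInfo Info = LeafInfo;
  while (Cur) {
    Key = ":" + std::to_string(Info.AccessIndex) + Key;
    auto It = Chain.find(Cur);
    if (It == Chain.end())
      break; // reached the base of the chain
    Cur = It->second.first;
    Info = It->second.second;
  }
  return Key;
}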
- std::map BaseAICalls; + std::map BaseAICalls; bool doTransformation(Module &M); - void traceAICall(CallInst *Call, uint32_t Kind, const MDNode *ParentMeta, - uint32_t ParentAI); - void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind, - const MDNode *ParentMeta, uint32_t ParentAI); - void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind, - const MDNode *ParentMeta, uint32_t ParentAI); + void traceAICall(CallInst *Call, CallInfo &ParentInfo); + void traceBitCast(BitCastInst *BitCast, CallInst *Parent, + CallInfo &ParentInfo); + void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, + CallInfo &ParentInfo); void collectAICallChains(Module &M, Function &F); - bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind, - const MDNode *&TypeMeta, uint32_t &AccessIndex); + bool IsPreserveDIAccessIndexCall(const CallInst *Call, CallInfo &Cinfo); bool IsValidAIChain(const MDNode *ParentMeta, uint32_t ParentAI, const MDNode *ChildMeta); bool removePreserveAccessIndexIntrinsic(Module &M); void replaceWithGEP(std::vector &CallList, uint32_t NumOfZerosIndex, uint32_t DIIndex); - Value *computeBaseAndAccessKey(CallInst *Call, std::string &AccessKey, - uint32_t Kind, MDNode *&BaseMeta); - bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex); - bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind); + Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo, + std::string &AccessKey, MDNode *&BaseMeta); + uint64_t getConstant(const Value *IndexValue); + bool transformGEPChain(Module &M, CallInst *Call, CallInfo &CInfo); }; } // End anonymous namespace @@ -142,7 +147,7 @@ bool BPFAbstractMemberAccess::runOnModule(Module &M) { LLVM_DEBUG(dbgs() << "********** Abstract Member Accesses **********\n"); // Bail out if no debug info. - if (empty(M.debug_compile_units())) + if (M.debug_compile_units().empty()) return false; return doTransformation(M); @@ -192,9 +197,7 @@ static uint32_t calcArraySize(const DICompositeType *CTy, uint32_t StartDim) { /// Check whether a call is a preserve_*_access_index intrinsic call or not. 
bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, - uint32_t &Kind, - const MDNode *&TypeMeta, - uint32_t &AccessIndex) { + CallInfo &CInfo) { if (!Call) return false; @@ -202,30 +205,30 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, if (!GV) return false; if (GV->getName().startswith("llvm.preserve.array.access.index")) { - Kind = BPFPreserveArrayAI; - TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index); - if (!TypeMeta) + CInfo.Kind = BPFPreserveArrayAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.array.access.index intrinsic"); - AccessIndex = cast(Call->getArgOperand(2)) - ->getZExtValue(); + CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); + CInfo.Base = Call->getArgOperand(0); return true; } if (GV->getName().startswith("llvm.preserve.union.access.index")) { - Kind = BPFPreserveUnionAI; - TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index); - if (!TypeMeta) + CInfo.Kind = BPFPreserveUnionAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic"); - AccessIndex = cast(Call->getArgOperand(1)) - ->getZExtValue(); + CInfo.AccessIndex = getConstant(Call->getArgOperand(1)); + CInfo.Base = Call->getArgOperand(0); return true; } if (GV->getName().startswith("llvm.preserve.struct.access.index")) { - Kind = BPFPreserveStructAI; - TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index); - if (!TypeMeta) + CInfo.Kind = BPFPreserveStructAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic"); - AccessIndex = cast(Call->getArgOperand(2)) - ->getZExtValue(); + CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); + CInfo.Base = Call->getArgOperand(0); return true; } @@ -238,8 +241,7 @@ void BPFAbstractMemberAccess::replaceWithGEP(std::vector &CallList, for (auto Call : CallList) { uint32_t Dimension = 1; if (DimensionIndex > 0) - Dimension = cast(Call->getArgOperand(DimensionIndex)) - ->getZExtValue(); + Dimension = getConstant(Call->getArgOperand(DimensionIndex)); Constant *Zero = ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0); @@ -265,16 +267,14 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) { for (auto &BB : F) for (auto &I : BB) { auto *Call = dyn_cast(&I); - uint32_t Kind; - const MDNode *TypeMeta; - uint32_t AccessIndex; - if (!IsPreserveDIAccessIndexCall(Call, Kind, TypeMeta, AccessIndex)) + CallInfo CInfo; + if (!IsPreserveDIAccessIndexCall(Call, CInfo)) continue; Found = true; - if (Kind == BPFPreserveArrayAI) + if (CInfo.Kind == BPFPreserveArrayAI) PreserveArrayIndexCalls.push_back(Call); - else if (Kind == BPFPreserveUnionAI) + else if (CInfo.Kind == BPFPreserveUnionAI) PreserveUnionIndexCalls.push_back(Call); else PreserveStructIndexCalls.push_back(Call); @@ -349,99 +349,94 @@ bool BPFAbstractMemberAccess::IsValidAIChain(const MDNode *ParentType, return dyn_cast(stripQualifiers(Ty)) == CTy; } -void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind, - const MDNode *ParentMeta, - uint32_t ParentAI) { +void BPFAbstractMemberAccess::traceAICall(CallInst *Call, + CallInfo &ParentInfo) { for (User *U : Call->users()) { Instruction 
*Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Call, Kind, ParentMeta, ParentAI); + traceBitCast(BI, Call, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - uint32_t CIKind; - const MDNode *ChildMeta; - uint32_t ChildAI; - if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) && - IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) { - AIChain[CI] = std::make_pair(Call, Kind); - traceAICall(CI, CIKind, ChildMeta, ChildAI); + CallInfo ChildInfo; + + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Call, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Call, Kind, ParentMeta, ParentAI); + traceGEP(GI, Call, ParentInfo); else - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; } else { - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; } } } void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast, - CallInst *Parent, uint32_t Kind, - const MDNode *ParentMeta, - uint32_t ParentAI) { + CallInst *Parent, + CallInfo &ParentInfo) { for (User *U : BitCast->users()) { Instruction *Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Parent, Kind, ParentMeta, ParentAI); + traceBitCast(BI, Parent, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - uint32_t CIKind; - const MDNode *ChildMeta; - uint32_t ChildAI; - if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) && - IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) { - AIChain[CI] = std::make_pair(Parent, Kind); - traceAICall(CI, CIKind, ChildMeta, ChildAI); + CallInfo ChildInfo; + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Parent, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Parent, Kind, ParentMeta, ParentAI); + traceGEP(GI, Parent, ParentInfo); else - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } } void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent, - uint32_t Kind, const MDNode *ParentMeta, - uint32_t ParentAI) { + CallInfo &ParentInfo) { for (User *U : GEP->users()) { Instruction *Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Parent, Kind, ParentMeta, ParentAI); + traceBitCast(BI, Parent, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - uint32_t CIKind; - const MDNode *ChildMeta; - uint32_t ChildAI; - if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) && - IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) { - AIChain[CI] = std::make_pair(Parent, Kind); - traceAICall(CI, CIKind, ChildMeta, ChildAI); + CallInfo ChildInfo; + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Parent, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if 
(GI->hasAllZeroIndices()) - traceGEP(GI, Parent, Kind, ParentMeta, ParentAI); + traceGEP(GI, Parent, ParentInfo); else - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } } @@ -452,44 +447,37 @@ void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) { for (auto &BB : F) for (auto &I : BB) { - uint32_t Kind; - const MDNode *TypeMeta; - uint32_t AccessIndex; + CallInfo CInfo; auto *Call = dyn_cast(&I); - if (!IsPreserveDIAccessIndexCall(Call, Kind, TypeMeta, AccessIndex) || + if (!IsPreserveDIAccessIndexCall(Call, CInfo) || AIChain.find(Call) != AIChain.end()) continue; - traceAICall(Call, Kind, TypeMeta, AccessIndex); + traceAICall(Call, CInfo); } } -/// Get access index from the preserve_*_access_index intrinsic calls. -bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue, - uint64_t &AccessIndex) { +uint64_t BPFAbstractMemberAccess::getConstant(const Value *IndexValue) { const ConstantInt *CV = dyn_cast(IndexValue); - if (!CV) - return false; - - AccessIndex = CV->getValue().getZExtValue(); - return true; + assert(CV); + return CV->getValue().getZExtValue(); } /// Compute the base of the whole preserve_*_access_index chains, i.e., the base /// pointer of the first preserve_*_access_index call, and construct the access /// string, which will be the name of a global variable. Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, + CallInfo &CInfo, std::string &AccessKey, - uint32_t Kind, MDNode *&TypeMeta) { Value *Base = nullptr; std::string TypeName; - std::stack> CallStack; + std::stack> CallStack; // Put the access chain into a stack with the top as the head of the chain. while (Call) { - CallStack.push(std::make_pair(Call, Kind)); - Kind = AIChain[Call].second; + CallStack.push(std::make_pair(Call, CInfo)); + CInfo = AIChain[Call].second; Call = AIChain[Call].first; } @@ -508,14 +496,14 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, while (CallStack.size()) { auto StackElem = CallStack.top(); Call = StackElem.first; - Kind = StackElem.second; + CInfo = StackElem.second; if (!Base) - Base = Call->getArgOperand(0); + Base = CInfo.Base; - MDNode *MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index); - DIType *Ty = stripQualifiers(cast(MDN)); - if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) { + DIType *Ty = stripQualifiers(cast(CInfo.Metadata)); + if (CInfo.Kind == BPFPreserveUnionAI || + CInfo.Kind == BPFPreserveStructAI) { // struct or union type TypeName = Ty->getName(); TypeMeta = Ty; @@ -527,9 +515,7 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, CallStack.pop(); // BPFPreserveArrayAI - uint64_t AccessIndex; - if (!getAccessIndex(Call->getArgOperand(2), AccessIndex)) - return nullptr; + uint64_t AccessIndex = CInfo.AccessIndex; DIType *BaseTy = nullptr; bool CheckElemType = false; @@ -580,18 +566,14 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, // and access key construction. while (CallStack.size()) { auto StackElem = CallStack.top(); - Call = StackElem.first; - Kind = StackElem.second; + CInfo = StackElem.second; CallStack.pop(); // Access Index - uint64_t AccessIndex; - uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 
1 : 2; - if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex)) - return nullptr; + uint64_t AccessIndex = CInfo.AccessIndex; AccessKey += ":" + std::to_string(AccessIndex); - MDNode *MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index); + MDNode *MDN = CInfo.Metadata; // At this stage, it cannot be pointer type. auto *CTy = cast(stripQualifiers(cast(MDN))); uint32_t Tag = CTy->getTag(); @@ -615,11 +597,11 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, /// Call/Kind is the base preserve_*_access_index() call. Attempts to do /// transformation to a chain of relocable GEPs. bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call, - uint32_t Kind) { + CallInfo &CInfo) { std::string AccessKey; MDNode *TypeMeta; Value *Base = - computeBaseAndAccessKey(Call, AccessKey, Kind, TypeMeta); + computeBaseAndAccessKey(Call, CInfo, AccessKey, TypeMeta); if (!Base) return false; diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index e61e7346805780..218b0302927c54 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -59,7 +59,7 @@ bool BPFAsmPrinter::doInitialization(Module &M) { AsmPrinter::doInitialization(M); // Only emit BTF when debuginfo available. - if (MAI->doesSupportDebugInformation() && !empty(M.debug_compile_units())) { + if (MAI->doesSupportDebugInformation() && !M.debug_compile_units().empty()) { BTF = new BTFDebug(this); Handlers.push_back(HandlerInfo(std::unique_ptr(BTF), "emit", "Debug Info Emission", "BTF", diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 759a7fdb32b852..142e9cebb79e9a 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -485,8 +485,11 @@ getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo, assert(MO.isExpr() && "getJumpOffset16OpValue expects only expressions or an immediate"); - // TODO: Push fixup. - return 0; + const MCExpr *Expr = MO.getExpr(); + Mips::Fixups FixupKind = + isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16 : Mips::fixup_Mips_LO16; + Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind))); + return 0; } /// getJumpTargetOpValue - Return binary encoding of the jump diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index ddeec03ba784ce..79c47d1b650843 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -143,12 +143,15 @@ class MipsMCInstrAnalysis : public MCInstrAnalysis { return false; switch (Info->get(Inst.getOpcode()).OpInfo[NumOps - 1].OperandType) { case MCOI::OPERAND_UNKNOWN: - case MCOI::OPERAND_IMMEDIATE: - // jal, bal ... - Target = Inst.getOperand(NumOps - 1).getImm(); + case MCOI::OPERAND_IMMEDIATE: { + // j, jal, jalx, jals + // Absolute branch within the current 256 MB-aligned region + uint64_t Region = Addr & ~uint64_t(0xfffffff); + Target = Region + Inst.getOperand(NumOps - 1).getImm(); return true; + } case MCOI::OPERAND_PCREL: - // b, j, beq ... + // b, beq ... 
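A standalone illustration of the two target computations in this hunk; the helper names and values are hypothetical, only the arithmetic mirrors the code above:

#include <cstdint>

// Absolute (j/jal-style) operands resolve within the 256 MiB-aligned region
// of the branch address; PC-relative (b/beq-style) operands are added to it.
uint64_t absoluteTarget(uint64_t Addr, uint64_t Imm) {
  uint64_t Region = Addr & ~uint64_t(0xfffffff); // e.g. 0x40001234 -> 0x40000000
  return Region + Imm;
}

uint64_t pcRelativeTarget(uint64_t Addr, int64_t Imm) {
  return Addr + Imm; // offset is relative to the branch itself
}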
Target = Addr + Inst.getOperand(NumOps - 1).getImm(); return true; default: diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index 55efe2cdc83a99..166ddea0431f3c 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -865,12 +865,15 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); MipsFunctionInfo *MipsFI = MF.getInfo(); MipsABIInfo ABI = STI.getABI(); + unsigned RA = ABI.IsN64() ? Mips::RA_64 : Mips::RA; unsigned FP = ABI.GetFramePtr(); unsigned BP = ABI.IsN64() ? Mips::S7_64 : Mips::S7; - // Mark $fp as used if function has dedicated frame pointer. - if (hasFP(MF)) + // Mark $ra and $fp as used if function has dedicated frame pointer. + if (hasFP(MF)) { + setAliasRegs(MF, SavedRegs, RA); setAliasRegs(MF, SavedRegs, FP); + } // Mark $s7 as used if function has dedicated base pointer. if (hasBP(MF)) setAliasRegs(MF, SavedRegs, BP); diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h index ce594e1fb4fa5b..80ab1ea9f635f3 100644 --- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -120,7 +120,7 @@ class MipsSEDAGToDAGISel : public MipsDAGToDAGISel { /// power of 2. bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override; /// Select constant vector splats whose value is a run of set bits - /// ending at the most significant bit + /// ending at the most significant bit. bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override; /// Select constant vector splats whose value is a run of set bits /// starting at bit zero. diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp index d467f5c4a43921..fb9dd5d7aa7588 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -19,8 +19,8 @@ using namespace llvm; const PPCMCExpr* PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin); + bool IsDarwin, MCContext &Ctx) { + return new (Ctx) PPCMCExpr(Kind, Expr, IsDarwin); } void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index 449e2c34f74df0..ad1454566162ac 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -45,21 +45,21 @@ class PPCMCExpr : public MCTargetExpr { /// @{ static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr, - bool isDarwin, MCContext &Ctx); + bool IsDarwin, MCContext &Ctx); static const PPCMCExpr *createLo(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_LO, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_LO, Expr, IsDarwin, Ctx); } static const PPCMCExpr *createHi(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_HI, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_HI, Expr, IsDarwin, Ctx); } static const PPCMCExpr *createHa(const MCExpr *Expr, - bool isDarwin, MCContext &Ctx) { - return create(VK_PPC_HA, Expr, isDarwin, Ctx); + bool IsDarwin, MCContext &Ctx) { + return create(VK_PPC_HA, Expr, IsDarwin, Ctx); } /// @} diff --git 
a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 667eb91059990c..0534773c4c9ed9 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -50,10 +50,10 @@ namespace llvm { FunctionPass *createPPCExpandISELPass(); FunctionPass *createPPCPreEmitPeepholePass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AsmPrinter &AP, bool isDarwin); + AsmPrinter &AP, bool IsDarwin); bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &OutMO, AsmPrinter &AP, - bool isDarwin); + bool IsDarwin); void initializePPCCTRLoopsPass(PassRegistry&); #ifndef NDEBUG diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index db30e16d154782..b9e52a11274f01 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -517,7 +517,7 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, /// void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; - bool isDarwin = TM.getTargetTriple().isOSDarwin(); + const bool IsDarwin = TM.getTargetTriple().isOSDarwin(); const Module *M = MF->getFunction().getParent(); PICLevel::Level PL = M->getPICLevel(); @@ -604,7 +604,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // addis r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@ha // addi r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@l // Get the offset from the GOT Base Register to the GOT - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); if (Subtarget->isSecurePlt() && isPositionIndependent() ) { unsigned PICR = TmpInst.getOperand(0).getReg(); MCSymbol *BaseSymbol = OutContext.getOrCreateSymbol( @@ -655,10 +655,10 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } case PPC::LWZtoc: { - assert(!isDarwin && "TOC is an ELF/XCOFF construct."); + assert(!IsDarwin && "TOC is an ELF/XCOFF construct."); // Transform %rN = LWZtoc @op1, %r2 - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); // Change the opcode to LWZ. TmpInst.setOpcode(PPC::LWZ); @@ -724,7 +724,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::LDtocBA: case PPC::LDtoc: { // Transform %x3 = LDtoc @min1, %x2 - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); // Change the opcode to LD, and the global address operand to be a // reference to the TOC entry we will synthesize later. @@ -755,7 +755,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::ADDIStocHA8: { // Transform %xd = ADDIStocHA8 %x2, @sym - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); // Change the opcode to ADDIS8. If the global address is external, has // common linkage, is a non-local function address, or is a jump table @@ -801,7 +801,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::LDtocL: { // Transform %xd = LDtocL @sym, %xs - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); // Change the opcode to LD. 
If the global address is external, has // common linkage, or is a jump table address, then reference the @@ -843,7 +843,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case PPC::ADDItocL: { // Transform %xd = ADDItocL %xs, @sym - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); // Change the opcode to ADDI8. If the global address is external, then // generate a TOC entry and reference that. Otherwise reference the @@ -888,7 +888,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::LDgotTprelL: case PPC::LDgotTprelL32: { // Transform %xd = LDgotTprelL @sym, %xs - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); // Change the opcode to LD. TmpInst.setOpcode(Subtarget->isPPC64() ? PPC::LD : PPC::LWZ); @@ -1130,7 +1130,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } - LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin); EmitToStreamer(*OutStreamer, TmpInst); } diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 8306eb679dd840..06a4d183e78193 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1776,8 +1776,8 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // Save R31 if necessary int FPSI = FI->getFramePointerSaveIndex(); - bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); + const bool isPPC64 = Subtarget.isPPC64(); + const bool IsDarwinABI = Subtarget.isDarwinABI(); MachineFrameInfo &MFI = MF.getFrameInfo(); // If the frame pointer save index hasn't been defined yet. @@ -1826,7 +1826,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the // function uses CR 2, 3, or 4. - if (!isPPC64 && !isDarwinABI && + if (!isPPC64 && !IsDarwinABI && (SavedRegs.test(PPC::CR2) || SavedRegs.test(PPC::CR3) || SavedRegs.test(PPC::CR4))) { diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8d8ffc1199f359..8cf6a660b08bd2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14516,7 +14516,7 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, Register PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const { bool isPPC64 = Subtarget.isPPC64(); - bool isDarwinABI = Subtarget.isDarwinABI(); + bool IsDarwinABI = Subtarget.isDarwinABI(); if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || (!isPPC64 && VT != MVT::i32)) @@ -14525,8 +14525,8 @@ Register PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, bool is64Bit = isPPC64 && VT == MVT::i64; Register Reg = StringSwitch(RegName) .Case("r1", is64Bit ? PPC::X1 : PPC::R1) - .Case("r2", (isDarwinABI || isPPC64) ? Register() : PPC::R2) - .Case("r13", (!isPPC64 && isDarwinABI) ? Register() : + .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2) + .Case("r13", (!isPPC64 && IsDarwinABI) ? Register() : (is64Bit ? 
PPC::X13 : PPC::R13)) .Default(Register()); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 2b413d0b97abc7..06533fe0de33be 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2273,7 +2273,7 @@ void PPCInstrInfo::replaceInstrOperandWithImm(MachineInstr &MI, Register InUseReg = MI.getOperand(OpNo).getReg(); MI.getOperand(OpNo).ChangeToImmediate(Imm); - if (empty(MI.implicit_operands())) + if (MI.implicit_operands().empty()) return; // We need to make sure that the MI didn't have any implicit use @@ -3571,16 +3571,20 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, } else { // The 32 bit and 64 bit instructions are quite different. if (SpecialShift32) { - // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31). - uint64_t SH = RightShift ? 32 - ShAmt : ShAmt; + // Left shifts use (N, 0, 31-N). + // Right shifts use (32-N, N, 31) if 0 < N < 32. + // use (0, 0, 31) if N == 0. + uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt; uint64_t MB = RightShift ? ShAmt : 0; uint64_t ME = RightShift ? 31 : 31 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB) .addImm(ME); } else { - // Left shifts use (N, 63-N), right shifts use (64-N, N). - uint64_t SH = RightShift ? 64 - ShAmt : ShAmt; + // Left shifts use (N, 63-N). + // Right shifts use (64-N, N) if 0 < N < 64. + // use (0, 0) if N == 0. + uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 64 - ShAmt : ShAmt; uint64_t ME = RightShift ? ShAmt : 63 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 096eb1e0175ce3..24183277519b6e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -3575,23 +3575,6 @@ def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)), (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)), - (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)), - (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>; - -defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)), - (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), - (LO16 imm:$imm)), sub_eq)>; - def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)), (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)), @@ -3603,17 +3586,6 @@ def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)), def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)), (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)), - (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)), - 
(EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)), - (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; - // SETCC for i64. def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)), (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>; @@ -3643,6 +3615,47 @@ def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)), (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)), + (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)), + (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)), + (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; + +// Instantiations of CRNotPat for i32. +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)), + (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>; +defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)), + (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>; + +defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)), + (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), + (LO16 imm:$imm)), sub_eq)>; + +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)), + (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)), + (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)), + (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>; + +// Instantiations of CRNotPat for i64. 
defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)), (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>; defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)), @@ -3660,17 +3673,6 @@ defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)), (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)), (LO16 imm:$imm)), sub_eq)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)), - (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)), - (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)), - (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; - defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)), (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>; defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)), @@ -3682,6 +3684,56 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)), defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)), (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; +let Predicates = [HasFPU] in { +// Instantiations of CRNotPat for f32. +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; + +// Instantiations of CRNotPat for f64. +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; + +// Instantiations of CRNotPat for f128. +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)), + (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; +} + // SETCC for f32. 
let Predicates = [HasFPU] in { def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)), @@ -3699,21 +3751,6 @@ def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)), def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)), (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; - // SETCC for f64. def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)), (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; @@ -3730,21 +3767,6 @@ def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)), def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)), (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; - // SETCC for f128. 
def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)), (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; @@ -3761,21 +3783,6 @@ def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)), def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)), (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; - } // This must be in this file because it relies on patterns defined in this file diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 2f025b5a59d4dd..2aad5860d87f01 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -1011,21 +1011,21 @@ def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG $S, sub_64))>; } -// Additional fnmsub patterns: -a*c + b == -(a*c - b) -def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), - (XSNMSUBADP $B, $C, $A)>; -def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), - (XSNMSUBADP $B, $C, $A)>; - -def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B), - (XVNMSUBADP $B, $C, $A)>; -def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B), - (XVNMSUBADP $B, $C, $A)>; - -def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), - (XVNMSUBASP $B, $C, $A)>; -def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), - (XVNMSUBASP $B, $C, $A)>; +// Additional fnmsub patterns: -a*b + c == -(a*b - c) +def : Pat<(fma (fneg f64:$A), f64:$B, f64:$C), + (XSNMSUBADP $C, $A, $B)>; +def : Pat<(fma f64:$A, (fneg f64:$B), f64:$C), + (XSNMSUBADP $C, $A, $B)>; + +def : Pat<(fma (fneg v2f64:$A), v2f64:$B, v2f64:$C), + (XVNMSUBADP $C, $A, $B)>; +def : Pat<(fma v2f64:$A, (fneg v2f64:$B), v2f64:$C), + (XVNMSUBADP $C, $A, $B)>; + +def : Pat<(fma (fneg v4f32:$A), v4f32:$B, v4f32:$C), + (XVNMSUBASP $C, $A, $B)>; +def : Pat<(fma v4f32:$A, (fneg v4f32:$B), v4f32:$C), + (XVNMSUBASP $C, $A, $B)>; def : Pat<(v2f64 (bitconvert v4f32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; @@ -2656,9 +2656,9 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (fneg (int_ppc_fmaf128_round_to_odd f128:$vA, f128:$vB, (fneg f128:$vTi))))]>; - // Additional fnmsub patterns: -a*c + b == -(a*c - b) - def : Pat<(fma (fneg f128:$A), f128:$C, f128:$B), (XSNMSUBQP $B, $C, $A)>; - def : Pat<(fma f128:$A, (fneg f128:$C), f128:$B), (XSNMSUBQP $B, $C, $A)>; + // Additional fnmsub patterns: -a*b + c == -(a*b - c) + def : Pat<(fma (fneg f128:$A), f128:$B, f128:$C), (XSNMSUBQP $C, $A, $B)>; + def : Pat<(fma f128:$A, (fneg f128:$B), f128:$C), (XSNMSUBQP $C, $A, $B)>; //===--------------------------------------------------------------------===// // Quad/Double-Precision Compare Instructions: diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 32b9818e70b4a7..b6496f189a3aeb 100644 --- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -79,7 +79,7 @@ static 
MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, } static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, - AsmPrinter &Printer, bool isDarwin) { + AsmPrinter &Printer, bool IsDarwin) { MCContext &Ctx = Printer.OutContext; MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; @@ -137,10 +137,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, // Add ha16() / lo16() markers if required. switch (access) { case PPCII::MO_LO: - Expr = PPCMCExpr::createLo(Expr, isDarwin, Ctx); + Expr = PPCMCExpr::createLo(Expr, IsDarwin, Ctx); break; case PPCII::MO_HA: - Expr = PPCMCExpr::createHa(Expr, isDarwin, Ctx); + Expr = PPCMCExpr::createHa(Expr, IsDarwin, Ctx); break; } @@ -148,20 +148,20 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, } void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, - AsmPrinter &AP, bool isDarwin) { + AsmPrinter &AP, bool IsDarwin) { OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MCOperand MCOp; if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP, - isDarwin)) + IsDarwin)) OutMI.addOperand(MCOp); } } bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, MCOperand &OutMO, AsmPrinter &AP, - bool isDarwin) { + bool IsDarwin) { switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); @@ -181,20 +181,20 @@ bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO, return true; case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: - OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); + OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, IsDarwin); return true; case MachineOperand::MO_JumpTableIndex: - OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); + OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, IsDarwin); return true; case MachineOperand::MO_ConstantPoolIndex: - OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); + OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, IsDarwin); return true; case MachineOperand::MO_BlockAddress: OutMO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP, - isDarwin); + IsDarwin); return true; case MachineOperand::MO_MCSymbol: - OutMO = GetSymbolRef(MO, MO.getMCSymbol(), AP, isDarwin); + OutMO = GetSymbolRef(MO, MO.getMCSymbol(), AP, IsDarwin); return true; case MachineOperand::MO_RegisterMask: return false; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 7d48634f206813..6b6f62e18ce963 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -131,6 +131,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, if (StackSize == 0 && !MFI.adjustsStack()) return; + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + // Split the SP adjustment to reduce the offsets of callee saved spill. + if (FirstSPAdjustAmount) + StackSize = FirstSPAdjustAmount; + // Allocate space on the stack if necessary. 
adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); @@ -170,7 +175,23 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, nullptr, RI->getDwarfRegNum(FPReg, true), 0)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); + } + // Emit the second SP adjustment after saving callee saved registers. + if (FirstSPAdjustAmount) { + uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + assert(SecondSPAdjustAmount > 0 && + "SecondSPAdjustAmount should be greater than zero"); + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount, + MachineInstr::FrameSetup); + // Emit ".cfi_def_cfa_offset StackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize())); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (hasFP(MF)) { // Realign Stack const RISCVRegisterInfo *RI = STI.getRegisterInfo(); if (RI->needsStackRealignment(MF)) { @@ -224,6 +245,24 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, MachineInstr::FrameDestroy); } + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + if (FirstSPAdjustAmount) { + uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + assert(SecondSPAdjustAmount > 0 && + "SecondSPAdjustAmount should be greater than zero"); + + adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, SecondSPAdjustAmount, + MachineInstr::FrameDestroy); + + // Emit ".cfi_def_cfa_offset FirstSPAdjustAmount" + unsigned CFIIndex = + MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, + -FirstSPAdjustAmount)); + BuildMI(MBB, LastFrameDestroy, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + if (hasFP(MF)) { // To find the instruction restoring FP from stack. for (auto &I = LastFrameDestroy; I != MBBI; ++I) { @@ -256,6 +295,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, .addCFIIndex(CFIIndex); } + if (FirstSPAdjustAmount) + StackSize = FirstSPAdjustAmount; + // Deallocate stack adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); @@ -284,6 +326,8 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + MFI.getOffsetAdjustment(); + uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); + if (CSI.size()) { MinCSFI = CSI[0].getFrameIdx(); MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); @@ -291,7 +335,11 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, if (FI >= MinCSFI && FI <= MaxCSFI) { FrameReg = RISCV::X2; - Offset += MF.getFrameInfo().getStackSize(); + + if (FirstSPAdjustAmount) + Offset += FirstSPAdjustAmount; + else + Offset += MF.getFrameInfo().getStackSize(); } else if (RI->needsStackRealignment(MF)) { assert(!MFI.hasVarSizedObjects() && "Unexpected combination of stack realignment and varsized objects"); @@ -404,3 +452,39 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(MI); } + +// We would like to split the SP adjustment to reduce prologue/epilogue +// as following instructions. In this way, the offset of the callee saved +// register could fit in a single store. 
+// add sp,sp,-2032 +// sw ra,2028(sp) +// sw s0,2024(sp) +// sw s1,2020(sp) +// sw s3,2012(sp) +// sw s4,2008(sp) +// add sp,sp,-64 +uint64_t +RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const std::vector &CSI = MFI.getCalleeSavedInfo(); + uint64_t StackSize = MFI.getStackSize(); + uint64_t StackAlign = getStackAlignment(); + + // FIXME: Disable SplitSPAdjust if the save-restore libcall is enabled once + // that patch lands. The callee saved registers will be pushed by the + // save-restore libcalls, so we don't have to split the SP adjustment + // in this case. + // + // Return the FirstSPAdjustAmount if the StackSize cannot fit in a signed + // 12-bit immediate and there exists a callee saved register that needs to be + // pushed. + if (!isInt<12>(StackSize) && (CSI.size() > 0)) { + // FirstSPAdjustAmount is chosen as (2048 - StackAlign) + // because 2048 would cause sp = sp + 2048 in the epilogue to be split into + // multiple instructions. An offset smaller than 2048 can fit in a single + // load/store instruction, and we have to stick to the stack alignment. + // 2048 is 16-byte aligned. The stack alignment for RV32 and RV64 is 16 and + // for RV32E it is 4, so (2048 - StackAlign) will satisfy the stack alignment. + return 2048 - StackAlign; + } + return 0; +} diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 9324336c6ef74f..dfc621c8bd070e 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -45,6 +45,11 @@ class RISCVFrameLowering : public TargetFrameLowering { eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + // Get the first stack adjustment amount for SplitSPAdjust. + // Return 0 if we don't want to split the SP adjustment in the prologue and + // epilogue. + uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const; + protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index a8809a8fbad6b7..fa0050f107b29c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -137,7 +137,8 @@ def uimm8_lsb000 : Operand, } // A 9-bit signed immediate where the least significant bit is zero. -def simm9_lsb0 : Operand { +def simm9_lsb0 : Operand, + ImmLeaf(Imm);}]> { let ParserMatchClass = SImmAsmOperand<9, "Lsb0">; let EncoderMethod = "getImmOpValueAsr1"; let DecoderMethod = "decodeSImmOperandAndLsl1<9>"; @@ -196,7 +197,8 @@ def simm10_lsb0000nonzero : Operand, } // A 12-bit signed immediate where the least significant bit is zero. -def simm12_lsb0 : Operand { +def simm12_lsb0 : Operand, + ImmLeaf(Imm);}]> { let ParserMatchClass = SImmAsmOperand<12, "Lsb0">; let EncoderMethod = "getImmOpValueAsr1"; let DecoderMethod = "decodeSImmOperandAndLsl1<12>"; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 032642942f2b5a..3b73c865ea1702 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -227,6 +227,12 @@ def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>; def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>; def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>; +// frsr, fssr are obsolete aliases replaced by frcsr, fscsr, so give them +// zero weight.
+def : InstAlias<"frsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 0>; +def : InstAlias<"fssr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs), 0>; +def : InstAlias<"fssr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 0>; + def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>; def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>; def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index b0c03c13fe60c2..f83a8a984ae0dd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -54,6 +54,12 @@ class WebAssemblyDAGToDAGISel final : public SelectionDAGISel { ForCodeSize = MF.getFunction().hasOptSize(); Subtarget = &MF.getSubtarget(); + + // Wasm64 is not fully supported right now (and is not specified) + if (Subtarget->hasAddr64()) + report_fatal_error( + "64-bit WebAssembly (wasm64) is not currently supported"); + return SelectionDAGISel::runOnMachineFunction(MF); } diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp index f66c6eb4ec1602..543dc8b00fa058 100644 --- a/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -67,8 +67,8 @@ class FixupLEAPass : public MachineFunctionPass { /// - LEA that uses RIP relative addressing mode /// - LEA that uses 16-bit addressing mode " /// This function currently handles the first 2 cases only. - MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, - MachineBasicBlock &MBB); + void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, bool OptIncDec); /// Look for LEAs that are really two address LEAs that we might be able to /// turn into regular ADD instructions. @@ -216,14 +216,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP)) continue; - if (IsSlowLEA) { + if (IsSlowLEA) processInstructionForSlowLEA(I, MBB); - } else if (IsSlow3OpsLEA) { - if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) { - MBB.erase(I); - I = NewMI; - } - } + else if (IsSlow3OpsLEA) + processInstrForSlow3OpLEA(I, MBB, OptIncDec); } // Second pass for creating LEAs. This may reverse some of the @@ -301,18 +297,14 @@ static inline bool isInefficientLEAReg(unsigned Reg) { Reg == X86::R13D || Reg == X86::R13; } -static inline bool isRegOperand(const MachineOperand &Op) { - return Op.isReg() && Op.getReg() != X86::NoRegister; -} - /// Returns true if this LEA uses base an index registers, and the base register /// is known to be inefficient for the subtarget. // TODO: use a variant scheduling class to model the latency profile // of LEA instructions, and implement this logic as a scheduling predicate. 
static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, const MachineOperand &Index) { - return Base.isReg() && isInefficientLEAReg(Base.getReg()) && - isRegOperand(Index); + return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() && + Index.getReg() != X86::NoRegister; } static inline bool hasLEAOffset(const MachineOperand &Offset) { @@ -534,112 +526,150 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, } } -MachineInstr * -FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, - MachineBasicBlock &MBB) { +void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, + bool OptIncDec) { + MachineInstr &MI = *I; const unsigned LEAOpcode = MI.getOpcode(); - const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt); const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg); const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp); const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); - if (!(TII->isThreeOperandsLEA(MI) || - hasInefficientLEABaseReg(Base, Index)) || + if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) || !TII->isSafeToClobberEFLAGS(MBB, MI) || Segment.getReg() != X86::NoRegister) - return nullptr; + return; + + Register DestReg = Dest.getReg(); + Register BaseReg = Base.getReg(); + Register IndexReg = Index.getReg(); + + if (MI.getOpcode() == X86::LEA64_32r) { + if (BaseReg != 0) + BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit); + if (IndexReg != 0) + IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit); + } - Register DstR = Dst.getReg(); - Register BaseR = Base.getReg(); - Register IndexR = Index.getReg(); - Register SSDstR = - (LEAOpcode == X86::LEA64_32r) ? Register(getX86SubSuperRegister(DstR, 64)) - : DstR; bool IsScale1 = Scale.getImm() == 1; - bool IsInefficientBase = isInefficientLEAReg(BaseR); - bool IsInefficientIndex = isInefficientLEAReg(IndexR); + bool IsInefficientBase = isInefficientLEAReg(BaseReg); + bool IsInefficientIndex = isInefficientLEAReg(IndexReg); // Skip these cases since it takes more than 2 instructions // to replace the LEA instruction. - if (IsInefficientBase && SSDstR == BaseR && !IsScale1) - return nullptr; - if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && - (IsInefficientIndex || !IsScale1)) - return nullptr; - - const DebugLoc DL = MI.getDebugLoc(); - const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); - const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + if (IsInefficientBase && DestReg == BaseReg && !IsScale1) + return; LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";); + MachineInstr *NewMI = nullptr; + // First try to replace LEA with one or two (for the 3-op LEA case) // add instructions: // 1.lea (%base,%index,1), %base => add %index,%base // 2.lea (%base,%index,1), %index => add %base,%index - if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { - const MachineOperand &Src = DstR == BaseR ? Index : Base; - MachineInstr *NewMI = - BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); - LLVM_DEBUG(NewMI->dump();); - // Create ADD instruction for the Offset in case of 3-Ops LEA. 
- if (hasLEAOffset(Offset)) { - NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - LLVM_DEBUG(NewMI->dump();); + if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) { + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + if (DestReg != BaseReg) + std::swap(BaseReg, IndexReg); + + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(BaseReg) + .addReg(IndexReg) + .addReg(Base.getReg(), RegState::Implicit) + .addReg(Index.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(BaseReg) + .addReg(IndexReg); } - return NewMI; - } - // If the base is inefficient try switching the index and base operands, - // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: - // lea offset(%base,%index,scale),%dst => - // lea (%base,%index,scale); add offset,%dst - if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { - MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .add(IsInefficientBase ? Index : Base) - .add(Scale) - .add(IsInefficientBase ? Base : Index) - .addImm(0) - .add(Segment); + } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode)) + .add(Dest) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); LLVM_DEBUG(NewMI->dump();); + } + + // If either replacement succeeded above, add the offset if needed, then + // replace the instruction. + if (NewMI) { // Create ADD instruction for the Offset in case of 3-Ops LEA. if (hasLEAOffset(Offset)) { - NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - LLVM_DEBUG(NewMI->dump();); + if (OptIncDec && Offset.isImm() && + (Offset.getImm() == 1 || Offset.getImm() == -1)) { + unsigned NewOpc = + getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1); + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg); + LLVM_DEBUG(NewMI->dump();); + } else { + unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset); + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Offset); + LLVM_DEBUG(NewMI->dump();); + } } - return NewMI; + + MBB.erase(I); + I = NewMI; + return; } + // Handle the rest of the cases with inefficient base register: - assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); + assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!"); assert(IsInefficientBase && "efficient base should be handled already!"); + // FIXME: Handle LEA64_32r. 
+ if (LEAOpcode == X86::LEA64_32r) + return; + // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst if (IsScale1 && !hasLEAOffset(Offset)) { - bool BIK = Base.isKill() && BaseR != IndexR; - TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK); + bool BIK = Base.isKill() && BaseReg != IndexReg; + TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK); LLVM_DEBUG(MI.getPrevNode()->dump();); - MachineInstr *NewMI = - BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Index); LLVM_DEBUG(NewMI->dump();); - return NewMI; + return; } + // lea offset(%base,%index,scale), %dst => // lea offset( ,%index,scale), %dst; add %base,%dst - MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .addReg(0) - .add(Scale) - .add(Index) - .add(Offset) - .add(Segment); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode)) + .add(Dest) + .addReg(0) + .add(Scale) + .add(Index) + .add(Offset) + .add(Segment); LLVM_DEBUG(NewMI->dump();); - NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Base); LLVM_DEBUG(NewMI->dump();); - return NewMI; + + MBB.erase(I); + I = NewMI; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8c837dfb6af5c8..3806b0e2330174 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1763,6 +1763,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } // We want to custom lower some of our intrinsics. @@ -5769,23 +5770,35 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - // Move the current value of the bit to be replace to the lsbs. - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getTargetConstant(IdxVal, dl, MVT::i8)); - // Xor with the new bit. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); - // Shift to MSB, filling bottom bits with 0. + + // Clear the upper bits of the subvector and move it to its insert position. unsigned ShiftLeft = NumElems - SubVecNumElems; - Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, - DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); - // Shift to the final position, filling upper bits with 0. + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, - DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); - // Xor with original vector leaving the new value. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); + + // Isolate the bits below the insertion point. 
+ unsigned LowShift = NumElems - IdxVal; + SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + + // Isolate the bits after the last inserted bit. + unsigned HighShift = IdxVal + SubVecNumElems; + SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + + // Now OR all 3 pieces together. + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); + SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); + // Reduce to original width if needed. - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, @@ -6713,14 +6726,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return true; } -/// Check a target shuffle mask's inputs to see if we can set any values to -/// SM_SentinelZero - this is for elements that are known to be zero -/// (not just zeroable) from their inputs. +/// Decode a target shuffle mask and inputs and see if any values are +/// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. -static bool setTargetShuffleZeroElements(SDValue N, - SmallVectorImpl &Mask, +static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops, - bool ResolveZero = true) { + APInt &KnownUndef, APInt &KnownZero) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; @@ -6729,15 +6740,17 @@ static bool setTargetShuffleZeroElements(SDValue N, if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; + int Size = Mask.size(); SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; + KnownUndef = KnownZero = APInt::getNullValue(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Mask.size()) == 0 && "Illegal split of shuffle value type"); - unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); + unsigned EltSizeInBits = VT.getSizeInBits() / Size; // Extract known constant input data. APInt UndefSrcElts[2]; @@ -6748,12 +6761,18 @@ static bool setTargetShuffleZeroElements(SDValue N, getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false)}; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { + for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. - if (M < 0) + if (M < 0) { + assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!"); + if (SM_SentinelUndef == M) + KnownUndef.setBit(i); + if (SM_SentinelZero == M) + KnownZero.setBit(i); continue; + } // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; @@ -6762,7 +6781,7 @@ static bool setTargetShuffleZeroElements(SDValue N, // We are referencing an UNDEF input. 
if (V.isUndef()) { - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); continue; } @@ -6775,33 +6794,32 @@ static bool setTargetShuffleZeroElements(SDValue N, int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) - Mask[i] = SM_SentinelUndef; - else if (ResolveZero && Idx == 0 && X86::isZeroNode(V.getOperand(0))) - Mask[i] = SM_SentinelZero; + KnownUndef.setBit(i); + else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) + KnownZero.setBit(i); continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) - Mask[i] = SM_SentinelUndef; - else if (ResolveZero && SrcEltBits[SrcIdx][M] == 0) - Mask[i] = SM_SentinelZero; + KnownUndef.setBit(i); + else if (SrcEltBits[SrcIdx][M] == 0) + KnownZero.setBit(i); } } - assert(VT.getVectorNumElements() == Mask.size() && + assert(VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"); return true; } // Forward declaration (for getFauxShuffleMask recursive check). // TODO: Use DemandedElts variant. -static bool resolveTargetShuffleInputs(SDValue Op, - SmallVectorImpl &Inputs, - SmallVectorImpl &Mask, - SelectionDAG &DAG, unsigned Depth, - bool ResolveZero); +static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, + SmallVectorImpl &Mask, + SelectionDAG &DAG, unsigned Depth, + bool ResolveZero); // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the @@ -6904,10 +6922,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, return false; SmallVector SrcMask0, SrcMask1; SmallVector SrcInputs0, SrcInputs1; - if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1, - ResolveZero) || - !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1, - ResolveZero)) + if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1, + ResolveZero) || + !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1, + ResolveZero)) return false; int MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; @@ -6956,8 +6974,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). SmallVector SubMask; SmallVector SubInputs; - if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, - SubMask, DAG, Depth + 1, ResolveZero)) + if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, + SubMask, DAG, Depth + 1, ResolveZero)) return false; if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || @@ -7233,49 +7251,47 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, Inputs = UsedInputs; } -/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs -/// and set the SM_SentinelUndef and SM_SentinelZero values. +/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs +/// and then sets the SM_SentinelUndef and SM_SentinelZero values. /// Returns true if the target shuffle mask was decoded. 
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, SelectionDAG &DAG, unsigned Depth, bool ResolveZero) { - if (!setTargetShuffleZeroElements(Op, Mask, Inputs, ResolveZero)) - if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, - ResolveZero)) - return false; - return true; -} - -/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs -/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the -/// remaining input indices in case we now have a unary shuffle and adjust the -/// inputs accordingly. -/// Returns true if the target shuffle mask was decoded. -static bool resolveTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, - SmallVectorImpl &Inputs, - SmallVectorImpl &Mask, - SelectionDAG &DAG, unsigned Depth, - bool ResolveZero) { - if (!setTargetShuffleZeroElements(Op, Mask, Inputs, ResolveZero)) - if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, - ResolveZero)) - return false; + EVT VT = Op.getValueType(); + if (!VT.isSimple() || !VT.isVector()) + return false; - resolveTargetShuffleInputsAndMask(Inputs, Mask); - return true; + APInt KnownUndef, KnownZero; + if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) { + for (int i = 0, e = Mask.size(); i != e; ++i) { + int &M = Mask[i]; + if (M < 0) + continue; + if (KnownUndef[i]) + M = SM_SentinelUndef; + else if (ResolveZero && KnownZero[i]) + M = SM_SentinelZero; + } + return true; + } + return getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, + ResolveZero); } -static bool resolveTargetShuffleInputs(SDValue Op, - SmallVectorImpl &Inputs, - SmallVectorImpl &Mask, - SelectionDAG &DAG, unsigned Depth, - bool ResolveZero = true) { +static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, + SmallVectorImpl &Mask, + SelectionDAG &DAG, unsigned Depth = 0, + bool ResolveZero = true) { + EVT VT = Op.getValueType(); + if (!VT.isSimple() || !VT.isVector()) + return false; + unsigned NumElts = Op.getValueType().getVectorNumElements(); APInt DemandedElts = APInt::getAllOnesValue(NumElts); - return resolveTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth, - ResolveZero); + return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth, + ResolveZero); } /// Returns the scalar element that will make up the ith @@ -10286,24 +10302,6 @@ static bool isTargetShuffleEquivalent(ArrayRef Mask, return true; } -// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle -// mask. -static SmallVector createTargetShuffleMask(ArrayRef Mask, - const APInt &Zeroable) { - int NumElts = Mask.size(); - assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); - - SmallVector TargetMask(NumElts, SM_SentinelUndef); - for (int i = 0; i != NumElts; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) - continue; - assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); - TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); - } - return TargetMask; -} - // Attempt to create a shuffle mask from a VSELECT condition mask. 
static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, SDValue Cond) { @@ -10954,9 +10952,9 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SelectionDAG &DAG); static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, - MutableArrayRef TargetMask, - bool &ForceV1Zero, bool &ForceV2Zero, - uint64_t &BlendMask) { + MutableArrayRef Mask, + const APInt &Zeroable, bool &ForceV1Zero, + bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = @@ -10964,13 +10962,12 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; - assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); + assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. - // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. - for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { - int M = TargetMask[i]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; if (M == SM_SentinelUndef) continue; if (M == i) @@ -10979,16 +10976,16 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, BlendMask |= 1ull << i; continue; } - if (M == SM_SentinelZero) { + if (Zeroable[i]) { if (V1IsZeroOrUndef) { ForceV1Zero = true; - TargetMask[i] = i; + Mask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; - TargetMask[i] = i + Size; + Mask[i] = i + Size; continue; } } @@ -11017,11 +11014,10 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SmallVector Mask = createTargetShuffleMask(Original, Zeroable); - uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; - if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, + SmallVector Mask(Original.begin(), Original.end()); + if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); @@ -14894,26 +14890,26 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); - int Size = Mask.size(); + int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; - int LaneSize = 128 / VT.getScalarSizeInBits(); - SmallVector RepeatMask(LaneSize, -1); + int NumLaneElts = 128 / VT.getScalarSizeInBits(); + SmallVector RepeatMask(NumLaneElts, -1); SmallVector, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. for (int Lane = 0; Lane != NumLanes; ++Lane) { - int Srcs[2] = { -1, -1 }; - SmallVector InLaneMask(LaneSize, -1); - for (int i = 0; i != LaneSize; ++i) { - int M = Mask[(Lane * LaneSize) + i]; + int Srcs[2] = {-1, -1}; + SmallVector InLaneMask(NumLaneElts, -1); + for (int i = 0; i != NumLaneElts; ++i) { + int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. 
- int LaneSrc = M / LaneSize; + int LaneSrc = M / NumLaneElts; int Src; if (Srcs[0] < 0 || Srcs[0] == LaneSrc) Src = 0; @@ -14923,7 +14919,7 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( return SDValue(); Srcs[Src] = LaneSrc; - InLaneMask[i] = (M % LaneSize) + Src * Size; + InLaneMask[i] = (M % NumLaneElts) + Src * NumElts; } // If this lane has two sources, see if it fits with the repeat mask so far. @@ -14979,23 +14975,23 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( if (LaneSrcs[Lane][0] >= 0) continue; - for (int i = 0; i != LaneSize; ++i) { - int M = Mask[(Lane * LaneSize) + i]; + for (int i = 0; i != NumLaneElts; ++i) { + int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // If RepeatMask isn't defined yet we can define it ourself. if (RepeatMask[i] < 0) - RepeatMask[i] = M % LaneSize; + RepeatMask[i] = M % NumLaneElts; - if (RepeatMask[i] < Size) { - if (RepeatMask[i] != M % LaneSize) + if (RepeatMask[i] < NumElts) { + if (RepeatMask[i] != M % NumLaneElts) return SDValue(); - LaneSrcs[Lane][0] = M / LaneSize; + LaneSrcs[Lane][0] = M / NumLaneElts; } else { - if (RepeatMask[i] != ((M % LaneSize) + Size)) + if (RepeatMask[i] != ((M % NumLaneElts) + NumElts)) return SDValue(); - LaneSrcs[Lane][1] = M / LaneSize; + LaneSrcs[Lane][1] = M / NumLaneElts; } } @@ -15003,14 +14999,14 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( return SDValue(); } - SmallVector NewMask(Size, -1); + SmallVector NewMask(NumElts, -1); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][0]; - for (int i = 0; i != LaneSize; ++i) { + for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) - M = Src * LaneSize + i; - NewMask[Lane * LaneSize + i] = M; + M = Src * NumLaneElts + i; + NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); @@ -15023,11 +15019,11 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][1]; - for (int i = 0; i != LaneSize; ++i) { + for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) - M = Src * LaneSize + i; - NewMask[Lane * LaneSize + i] = M; + M = Src * NumLaneElts + i; + NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); @@ -15038,12 +15034,12 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( cast(NewV2)->getMask() == Mask) return SDValue(); - for (int i = 0; i != Size; ++i) { - NewMask[i] = RepeatMask[i % LaneSize]; + for (int i = 0; i != NumElts; ++i) { + NewMask[i] = RepeatMask[i % NumLaneElts]; if (NewMask[i] < 0) continue; - NewMask[i] += (i / LaneSize) * LaneSize; + NewMask[i] += (i / NumLaneElts) * NumLaneElts; } return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } @@ -15440,7 +15436,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, - unsigned &ShuffleImm, ArrayRef Mask) { + unsigned &ShuffleImm, ArrayRef Mask, + const APInt &Zeroable) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && @@ -15450,7 +15447,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool ZeroLane[2] = { true, true }; for (int i = 0; i < NumElts; ++i) - ZeroLane[i & 1] &= isUndefOrZero(Mask[i]); + ZeroLane[i & 1] &= Zeroable[i]; // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. 
// Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. @@ -15483,19 +15480,17 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, } static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Original, + SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) && "Unexpected data type for VSHUFPD"); - SmallVector Mask = createTargetShuffleMask(Original, Zeroable); - unsigned Immediate = 0; bool ForceV1Zero = false, ForceV2Zero = false; if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate, - Mask)) + Mask, Zeroable)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. @@ -15508,6 +15503,42 @@ static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, DAG.getTargetConstant(Immediate, DL, MVT::i8)); } +// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed +// by zeroable elements in the remaining 24 elements. Turn this into two +// vmovqb instructions shuffled together. +static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + const APInt &Zeroable, + SelectionDAG &DAG) { + assert(VT == MVT::v32i8 && "Unexpected type!"); + + // The first 8 indices should be every 8th element. + if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8)) + return SDValue(); + + // Remaining elements need to be zeroable. + if (Zeroable.countLeadingOnes() < (Mask.size() - 8)) + return SDValue(); + + V1 = DAG.getBitcast(MVT::v4i64, V1); + V2 = DAG.getBitcast(MVT::v4i64, V2); + + V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1); + V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2); + + // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in + // the upper bits of the result using an unpckldq. + SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, + { 0, 1, 2, 3, 16, 17, 18, 19, + 4, 5, 6, 7, 20, 21, 22, 23 }); + // Insert the unpckldq into a zero vector to widen to v32i8. + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, + DAG.getConstant(0, DL, MVT::v32i8), Unpack, + DAG.getIntPtrConstant(0, DL)); +} + + /// Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -16108,6 +16139,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; + // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed + // by zeroable elements in the remaining 24 elements. Turn this into two + // vmovqb instructions shuffled together. + if (Subtarget.hasVLX()) + if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, + Mask, Zeroable, DAG)) + return V; + // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); @@ -19295,9 +19334,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // If called by the legalizer just return. - if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) { - if ((InVT == MVT::v8i64 || InVT == MVT::v16i32) && VT.is128BitVector()) { + // If we're called by the type legalizer, handle a few cases. 
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(InVT)) { + if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && + VT.is128BitVector()) { assert(Subtarget.hasVLX() && "Unexpected subtarget!"); // The default behavior is to truncate one step, concatenate, and then // truncate the remainder. We'd rather produce two 64-bit results and @@ -27924,6 +27965,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } } + if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 && + getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector && + isTypeLegal(MVT::v4i64)) { + // Input needs to be split and output needs to widened. Let's use two + // VTRUNCs, and shuffle their results together into the wider type. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(In, dl); + + Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo); + Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi); + SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi, + { 0, 1, 2, 3, 16, 17, 18, 19, + -1, -1, -1, -1, -1, -1, -1, -1 }); + Results.push_back(Res); + return; + } + return; } case ISD::ANY_EXTEND: @@ -32043,8 +32101,8 @@ static bool matchBinaryPermuteShuffle( uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector TargetMask(Mask.begin(), Mask.end()); - if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, - BlendMask)) { + if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero, + ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector RepeatedMask; @@ -32091,7 +32149,7 @@ static bool matchBinaryPermuteShuffle( (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { bool ForceV1Zero = false, ForceV2Zero = false; if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero, - PermuteImm, Mask)) { + PermuteImm, Mask, Zeroable)) { V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::SHUFP; @@ -32936,7 +32994,7 @@ static SDValue combineX86ShufflesRecursively( // Extract target shuffle mask and resolve sentinels and inputs. SmallVector OpMask; SmallVector OpInputs; - if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG, Depth)) + if (!getTargetShuffleInputs(Op, OpInputs, OpMask, DAG, Depth)) return SDValue(); // Add the inputs to the Ops list, avoiding duplicates. @@ -33039,6 +33097,9 @@ static SDValue combineX86ShufflesRecursively( Mask[i] = OpMaskedIdx; } + // Remove unused/repeated shuffle source ops. + resolveTargetShuffleInputsAndMask(Ops, Mask); + // Handle the all undef/zero cases early. if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) return DAG.getUNDEF(Root.getValueType()); @@ -33050,10 +33111,7 @@ static SDValue combineX86ShufflesRecursively( return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); - // Remove unused/repeated shuffle source ops. - resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); - HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); // Update the list of shuffle nodes that have been combined so far. 
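(Illustrative aside, not part of the diff.) The shuffle-decoding changes above split the old setTargetShuffleZeroElements/resolveTargetShuffleInputs pair into getTargetShuffleAndZeroables, which reports undef and zero lanes as APInt masks, and getTargetShuffleInputs, which folds those bits back into the SM_Sentinel* mask values, resolving zeros only when the caller asks for it. A minimal sketch of that folding step, using the same names the patch introduces:

    // Fold the known-undef/known-zero lane bits back into the shuffle mask.
    for (int i = 0, e = Mask.size(); i != e; ++i) {
      int &M = Mask[i];
      if (M < 0)
        continue;                        // already SM_SentinelUndef/SM_SentinelZero
      if (KnownUndef[i])
        M = SM_SentinelUndef;            // lane only reads an undef input
      else if (ResolveZero && KnownZero[i])
        M = SM_SentinelZero;             // lane is known to produce zero
    }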
@@ -33379,8 +33437,19 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); + // Share broadcast with the longest vector and extract low subvector (free). + for (SDNode *User : Src->uses()) + if (User != N.getNode() && + (User->getOpcode() == X86ISD::VBROADCAST || + User->getOpcode() == X86ISD::VBROADCAST_LOAD) && + User->getValueSizeInBits(0) > VT.getSizeInBits()) { + return extractSubVector(SDValue(User, 0), 0, DAG, DL, + VT.getSizeInBits()); + } + // vbroadcast(scalarload X) -> vbroadcast_load X - if (!SrcVT.isVector() && Src.hasOneUse() && + // For float loads, extract other uses of the scalar from the broadcast. + if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) && ISD::isNormalLoad(Src.getNode())) { LoadSDNode *LN = cast(Src); SDVTList Tys = DAG.getVTList(VT, MVT::Other); @@ -33388,17 +33457,19 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SDValue BcastLd = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); - return BcastLd; - } - - // Share broadcast with the longest vector and extract low subvector (free). - for (SDNode *User : Src->uses()) - if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && - User->getValueSizeInBits(0) > VT.getSizeInBits()) { - return extractSubVector(SDValue(User, 0), 0, DAG, DL, - VT.getSizeInBits()); + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceExtract = Src.hasOneUse(); + DCI.CombineTo(N.getNode(), BcastLd); + if (NoReplaceExtract) { + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + } else { + SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd, + DAG.getIntPtrConstant(0, DL)); + DCI.CombineTo(LN, Scl, BcastLd.getValue(1)); } + return N; // Return N so it doesn't get rechecked! + } return SDValue(); } @@ -33498,15 +33569,17 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, // Attempt to merge insertps Op1 with an inner target shuffle node. SmallVector TargetMask1; SmallVector Ops1; - if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { - int M = TargetMask1[SrcIdx]; - if (isUndefOrZero(M)) { + APInt KnownUndef1, KnownZero1; + if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1, + KnownZero1)) { + if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) { // Zero/UNDEF insertion - zero out element and remove dependency. InsertPSMask |= (1u << DstIdx); return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } // Update insertps mask srcidx and reference the source input directly. + int M = TargetMask1[SrcIdx]; assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); Op1 = Ops1[M < 4 ? 0 : 1]; @@ -33517,16 +33590,17 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, // Attempt to merge insertps Op0 with an inner target shuffle node. 
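Not part of the patch: the two broadcast rewrites above are value-preserving because the low lanes of a wider broadcast equal the narrower broadcast, and lane 0 of any broadcast equals the original scalar. A trivial standalone check, with made-up names:

    // broadcast_reuse_model.cpp -- illustrative only, not LLVM code.
    #include <array>
    #include <cassert>

    int main() {
      const float Scalar = 3.5f;

      // A wider broadcast of the same scalar...
      std::array<float, 8> Wide;
      Wide.fill(Scalar);

      // ...already contains the narrower broadcast in its low lanes, so
      // extracting the low subvector is free:
      std::array<float, 4> Narrow;
      Narrow.fill(Scalar);
      for (int i = 0; i != 4; ++i)
        assert(Wide[i] == Narrow[i]);

      // ...and any remaining scalar user can simply read element 0:
      assert(Wide[0] == Scalar);
    }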
SmallVector TargetMask0; SmallVector Ops0; - if (setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) { + APInt KnownUndef0, KnownZero0; + if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0, + KnownZero0)) { bool Updated = false; bool UseInput00 = false; bool UseInput01 = false; for (int i = 0; i != 4; ++i) { - int M = TargetMask0[i]; if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; - } else if (isUndefOrZero(M)) { + } else if (KnownUndef0[i] || KnownZero0[i]) { // If the target mask is undef/zero then we must zero the element. InsertPSMask |= (1u << i); Updated = true; @@ -33534,6 +33608,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } // The input vector element must be inline. + int M = TargetMask0[i]; if (M != i && M != (i + 4)) return SDValue(); @@ -34507,8 +34582,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Get target/faux shuffle mask. SmallVector OpMask; SmallVector OpInputs; - if (!VT.isSimple() || !getTargetShuffleInputs(Op, DemandedElts, OpInputs, - OpMask, TLO.DAG, Depth, false)) + if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, TLO.DAG, + Depth, false)) return false; // Shuffle inputs must be the same size as the result. @@ -34887,8 +34962,7 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( SmallVector ShuffleMask; SmallVector ShuffleOps; - if (VT.isSimple() && VT.isVector() && - resolveTargetShuffleInputs(Op, ShuffleOps, ShuffleMask, DAG, Depth)) { + if (getTargetShuffleInputs(Op, ShuffleOps, ShuffleMask, DAG, Depth)) { // If all the demanded elts are from one operand and are inline, // then we can use the operand directly. int NumOps = ShuffleOps.size(); @@ -35065,6 +35139,23 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { return false; } +// Helper to push sign extension of vXi1 SETCC result through bitops. +static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, + SDValue Src, const SDLoc &DL) { + switch (Src.getOpcode()) { + case ISD::SETCC: + return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + case ISD::AND: + case ISD::XOR: + case ISD::OR: + return DAG.getNode( + Src.getOpcode(), DL, SExtVT, + signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL), + signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL)); + } + llvm_unreachable("Unexpected node type for vXi1 sign extension"); +} + // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> @@ -35103,6 +35194,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; + bool PropagateSExt = false; switch (SrcVT.getSimpleVT().SimpleTy) { default: return SDValue(); @@ -35113,8 +35205,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. 
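Not part of the patch: signExtendBitcastSrcVector above relies on sign extension of an i1 distributing over AND/OR/XOR. A standalone exhaustive check of that identity on i8 masks (sext1 is a hypothetical helper name):

    // sext_through_bitops.cpp -- illustrative only, not LLVM code.
    #include <cassert>
    #include <cstdint>

    static int8_t sext1(bool B) { return B ? -1 : 0; }  // i1 -> i8 sign extend

    int main() {
      for (int A = 0; A != 2; ++A)
        for (int B = 0; B != 2; ++B) {
          bool X = A, Y = B;
          assert(sext1(X && Y) == (sext1(X) & sext1(Y)));
          assert(sext1(X || Y) == (sext1(X) | sext1(Y)));
          assert(sext1(X != Y) == (sext1(X) ^ sext1(Y)));
        }
    }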
- if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) + if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) { SExtVT = MVT::v4i64; + PropagateSExt = true; + } break; case MVT::v8i1: SExtVT = MVT::v8i16; @@ -35123,11 +35217,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. - // TODO : use checkBitcastSrcVectorSize - if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - (Src.getOperand(0).getValueType().is256BitVector() || - Src.getOperand(0).getValueType().is512BitVector())) { + if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) || + checkBitcastSrcVectorSize(Src, 512))) { SExtVT = MVT::v8i32; + PropagateSExt = true; } break; case MVT::v16i1: @@ -35150,7 +35243,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, return SDValue(); }; - SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) + : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) { V = getPMOVMSKB(DL, V, DAG, Subtarget); @@ -35403,6 +35497,21 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, DAG.getBitcast(MVT::i16, N0.getOperand(0))); + // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT + // determines // the number of bits loaded. Remaining bits are zero. + if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() && + VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) { + auto *BCast = cast(N0); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + VT.getVectorElementType(), + BCast->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); + return ResNode; + } + // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. @@ -35932,7 +36041,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, // Resolve the target shuffle inputs and mask. SmallVector Mask; SmallVector Ops; - if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG, 0)) + if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. @@ -41074,6 +41183,19 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return combineVectorTruncation(N, DAG, Subtarget); } +static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + SDLoc DL(N); + + if (auto SSatVal = detectSSatPattern(In, VT)) + return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); + if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + + return SDValue(); +} + /// Returns the negated value if the node \p N flips sign of FP value. 
/// /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000) @@ -42250,10 +42372,12 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX2()) || (OpSize == 512 && Subtarget.useAVX512Regs())) { - EVT VecVT = OpSize == 512 ? MVT::v16i32 : + auto BW = Subtarget.hasBWI(); + EVT VecVT = OpSize == 512 ? (BW ? MVT::v64i8 : MVT::v16i32) : OpSize == 256 ? MVT::v32i8 : MVT::v16i8; - EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; + EVT CmpVT = OpSize == 512 ? (BW ? MVT::v64i1 : MVT::v16i1) : VecVT; + SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: @@ -42273,6 +42397,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } // For 512-bits we want to emit a setcc that will lower to kortest. + if (OpSize == 512 && BW) + return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i64, Cmp), + DAG.getConstant(0xFFFFFFFFFFFFFFFF, DL, MVT::i64), CC); if (OpSize == 512) return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), DAG.getConstant(0xFFFF, DL, MVT::i16), CC); @@ -44088,12 +44215,15 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return SDValue(); MVT VT = N->getSimpleValueType(0); - EVT WideVecVT = N->getOperand(0).getValueType(); - SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); + SDValue InVec = N->getOperand(0); + SDValue InVecBC = peekThroughBitcasts(InVec); + EVT InVecVT = InVec.getValueType(); + EVT InVecBCVT = InVecBC.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && - TLI.isTypeLegal(WideVecVT) && - WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) { + TLI.isTypeLegal(InVecVT) && + InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) { auto isConcatenatedNot = [] (SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) @@ -44101,12 +44231,12 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue NotOp = V->getOperand(0); return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS; }; - if (isConcatenatedNot(WideVec.getOperand(0)) || - isConcatenatedNot(WideVec.getOperand(1))) { + if (isConcatenatedNot(InVecBC.getOperand(0)) || + isConcatenatedNot(InVecBC.getOperand(1))) { // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 - SDValue Concat = split256IntArith(WideVec, DAG); + SDValue Concat = split256IntArith(InVecBC, DAG); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, - DAG.getBitcast(WideVecVT, Concat), N->getOperand(1)); + DAG.getBitcast(InVecVT, Concat), N->getOperand(1)); } } @@ -44116,7 +44246,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (SDValue V = narrowExtractedVectorSelect(N, DAG)) return V; - SDValue InVec = N->getOperand(0); unsigned IdxVal = cast(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) @@ -44136,25 +44265,22 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Try to move vector bitcast after extract_subv by scaling extraction index: // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR - if (InVec.getOpcode() == ISD::BITCAST && - InVec.getOperand(0).getValueType().isVector()) { - SDValue SrcOp = InVec.getOperand(0); - EVT SrcVT = 
SrcOp.getValueType(); - unsigned SrcNumElts = SrcVT.getVectorNumElements(); - unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); + if (InVec != InVecBC && InVecBCVT.isVector()) { + unsigned SrcNumElts = InVecBCVT.getVectorNumElements(); + unsigned DestNumElts = InVecVT.getVectorNumElements(); if ((DestNumElts % SrcNumElts) == 0) { unsigned DestSrcRatio = DestNumElts / SrcNumElts; if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - SrcVT.getScalarType(), NewExtNumElts); + InVecBCVT.getScalarType(), NewExtNumElts); if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; SDLoc DL(N); SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, - SrcOp, NewIndex); + InVecBC, NewIndex); return DAG.getBitcast(VT, NewExtract); } } @@ -44200,7 +44326,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); - if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { + if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { @@ -44432,6 +44558,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); + case X86ISD::VTRUNC: return combineVTRUNC(N, DAG); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 2d3b8a556816ab..9b5de59430a521 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2145,93 +2145,82 @@ let Predicates = [HasAVX512] in { SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; } -multiclass avx512_icmp_packed opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86FoldableSchedWrite sched, +multiclass avx512_icmp_packed opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> { - let isCommutable = IsCommutable in + let isCommutable = IsCommutable, hasSideEffects = 0 in def rr : AVX512BI, - EVEX_4V, Sched<[sched]>; + []>, EVEX_4V, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rm : AVX512BI, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - let isCommutable = IsCommutable in + []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + let isCommutable = IsCommutable, hasSideEffects = 0 in def rrk : AVX512BI, - EVEX_4V, EVEX_K, Sched<[sched]>; + []>, EVEX_4V, EVEX_K, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rmk : AVX512BI, - EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } -multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, +multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, bit 
IsCommutable> : - avx512_icmp_packed { + avx512_icmp_packed { + let mayLoad = 1, hasSideEffects = 0 in { def rmb : AVX512BI, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512BI, - EVEX_4V, EVEX_K, EVEX_B, + []>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } -multiclass avx512_icmp_packed_vl opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86SchedWriteWidths sched, +multiclass avx512_icmp_packed_vl opc, string OpcodeStr, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed, EVEX_V256; - defm Z128 : avx512_icmp_packed, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, - PatFrag OpNode, PatFrag OpNode_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb, EVEX_V128; } } @@ -2239,53 +2228,42 @@ multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, // This fragment treats X86cmpm as commutable to help match loads in both // operands for PCMPEQ. def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>; -def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), - (X86setcc_commute node:$src1, node:$src2, SETEQ)>; def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (setcc node:$src1, node:$src2, SETGT)>; -def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpeqm_c node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; -def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpgtm node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { // FIXME: Is there a better scheduler class for VPCMP? 
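Not part of the patch: a quick standalone reminder of the symmetry argument behind the fragment above, namely why only the equality fragment can be treated as commutable (and so fold a load from either operand) while greater-than cannot. Names are illustrative:

    // cmp_commutativity_model.cpp -- illustrative only, not LLVM code.
    #include <cassert>

    int main() {
      for (int A = -2; A <= 2; ++A)
        for (int B = -2; B <= 2; ++B) {
          assert((A == B) == (B == A));        // EQ is symmetric: commutable
          if (A != B)
            assert((A > B) != (B > A));        // GT flips when operands swap
        }
    }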
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; -defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", SchedWriteVecALU, avx512vl_i16_info, HasBWI>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -3148,54 +3126,6 @@ multiclass avx512_mask_shiftop_w opc1, bits<8> opc2, string OpcodeStr, defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>; defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>; -// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. -multiclass axv512_icmp_packed_no_vlx_lowering { - def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrr") - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; - - def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag_su (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2)))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrrk") - (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; -} - -multiclass axv512_icmp_packed_rmb_no_vlx_lowering { - // Broadcast load. 
- def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrmb") - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - addr:$src2), - Narrow.KRC)>; - - def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag_su (Narrow.VT Narrow.RC:$src1), - (Narrow.BroadcastLdFrag addr:$src2)))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrmbk") - (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - addr:$src2), - Narrow.KRC)>; -} - // Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. multiclass axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - } - defm : axv512_icmp_packed_cc_no_vlx_lowering; defm : axv512_icmp_packed_cc_no_vlx_lowering; @@ -3377,22 +3279,6 @@ let Predicates = [HasAVX512, NoVLX] in { } let Predicates = [HasBWI, NoVLX] in { - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't - // increase the pattern complexity the way an immediate would. - let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - } - defm : axv512_icmp_packed_cc_no_vlx_lowering; defm : axv512_icmp_packed_cc_no_vlx_lowering; @@ -5060,22 +4946,11 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; - - def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), - (EXTRACT_SUBREG - (VPMULLQZrr - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), - sub_xmm)>; -} - -// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. 
-let Predicates = [HasDQI, NoVLX] in { - def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), (EXTRACT_SUBREG - (VPMULLQZrr + (VPMULLQZrmb (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + addr:$src2), sub_ymm)>; def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), @@ -5084,29 +4959,47 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (VPMULLQZrmb + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } -multiclass avx512_min_max_lowering { +multiclass avx512_min_max_lowering { def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; + def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + addr:$src2), + sub_ymm)>; def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } let Predicates = [HasAVX512, NoVLX] in { - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; + defm : avx512_min_max_lowering<"VPMAXUQZ", umax>; + defm : avx512_min_max_lowering<"VPMINUQZ", umin>; + defm : avx512_min_max_lowering<"VPMAXSQZ", smax>; + defm : avx512_min_max_lowering<"VPMINSQZ", smin>; } //===----------------------------------------------------------------------===// @@ -5282,22 +5175,17 @@ multiclass avx512_logical_lowering_bcast { // Register-broadcast logical operations. - def : Pat<(IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT (_.BroadcastLdFrag addr:$src2))))), - (!cast(InstrStr#rmb) _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (_.BroadcastLdFrag addr:$src2)))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.RC:$src0)), (!cast(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (_.BroadcastLdFrag addr:$src2)))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.ImmAllZerosV)), (!cast(InstrStr#rmbkz) _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -11488,102 +11376,6 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU, defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, avx512vl_i64_info>, VEX_W; -// Patterns to fold bitcasted FP broadcasts. -// FIXME: Need better DAG canonicalization. 
-let Predicates = [HasVLX] in { - def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - VR128X:$src2, (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - VR128X:$src2, (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - VR256X:$src2, (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - VR256X:$src2, (i8 timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; -} - -let Predicates = [HasAVX512] in { - def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - VR512:$src2, VR512:$src1, (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - VR512:$src2, (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog 
(bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - VR512:$src2, VR512:$src1, (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - VR512:$src2, (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; -} - // Patterns to use VPTERNLOG for vXi16/vXi8 vectors. let Predicates = [HasVLX] in { def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 0726b91c1966cd..78098fd6262f7a 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -535,6 +535,181 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; } + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik: + case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik: + case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmi: case X86::VPCMPBZrmik: + case X86::VPCMPBZrri: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmi: case X86::VPCMPDZrmik: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + case X86::VPCMPDZrri: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmi: case X86::VPCMPQZrmik: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + case X86::VPCMPQZrri: case X86::VPCMPQZrrik: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik: + case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik: + case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmi: case X86::VPCMPWZrmik: + case X86::VPCMPWZrri: case X86::VPCMPWZrrik: { + // Turn immediate 0 into the VPCMPEQ instruction. 
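Not part of the patch: a sketch of the immediate-to-mnemonic special casing in this hunk, assuming the standard AVX-512 integer compare predicate encoding (0 EQ, 1 LT, 2 LE, 4 NE, 5 NLT, 6 NLE). Only immediates 0 and 6 have dedicated VPCMPEQ/VPCMPGT opcodes, which drop the trailing immediate byte; everything else keeps the generic VPCMP form. The enum and the mnemonicFor helper are illustrative, not LLVM code.

    // vpcmp_imm_model.cpp -- illustrative only, not LLVM code.
    #include <cassert>
    #include <string>

    // AVX-512 integer-compare predicate immediates (signed form).
    enum VPCmpImm { EQ = 0, LT = 1, LE = 2, FALSE_ = 3,
                    NE = 4, NLT = 5, NLE = 6, TRUE_ = 7 };

    // Only EQ (0) and NLE / "greater than" (6) map to VPCMPEQ*/VPCMPGT*;
    // every other predicate keeps VPCMP plus an immediate byte.
    static std::string mnemonicFor(unsigned Imm) {
      if (Imm == EQ)
        return "vpcmpeqd";
      if (Imm == NLE)
        return "vpcmpgtd";
      return "vpcmpd $" + std::to_string(Imm);
    }

    int main() {
      assert(mnemonicFor(0) == "vpcmpeqd");
      assert(mnemonicFor(6) == "vpcmpgtd");
      assert(mnemonicFor(LE) == "vpcmpd $2");
    }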
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; 
break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + + // Turn immediate 6 into the VPCMPGT instruction. + if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc 
= X86::VPCMPGTQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + + break; + } + // CALL64r, CALL64pcrel32 - These instructions used to have // register inputs modeled as normal uses instead of implicit uses. As such, // they we used to truncate off all but the first operand (the callee). This diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 83c78451253bf6..b634da1d51fbe6 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3394,9 +3394,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { if (IsZeroCmp) { // Only enable vector loads for equality comparison. Right now the vector // version is not as fast for three way compare (see #33329). - // TODO: enable AVX512 when the DAG is ready. - // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); const unsigned PreferredWidth = ST->getPreferVectorWidth(); + if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32); if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); // All GPR and vector loads can be unaligned. 
SIMD compare requires integer diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 18ab6637305ef6..286191abff2029 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -13,6 +13,7 @@ #include "llvm/ToolDrivers/llvm-lib/LibDriver.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSet.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -141,6 +142,125 @@ static void doList(opt::InputArgList& Args) { fatalOpenError(std::move(Err), B->getBufferIdentifier()); } +static COFF::MachineTypes getCOFFFileMachine(MemoryBufferRef MB) { + std::error_code EC; + object::COFFObjectFile Obj(MB, EC); + if (EC) { + llvm::errs() << MB.getBufferIdentifier() + << ": failed to open: " << EC.message() << '\n'; + exit(1); + } + + uint16_t Machine = Obj.getMachine(); + if (Machine != COFF::IMAGE_FILE_MACHINE_I386 && + Machine != COFF::IMAGE_FILE_MACHINE_AMD64 && + Machine != COFF::IMAGE_FILE_MACHINE_ARMNT && + Machine != COFF::IMAGE_FILE_MACHINE_ARM64) { + llvm::errs() << MB.getBufferIdentifier() << ": unknown machine: " << Machine + << '\n'; + exit(1); + } + + return static_cast(Machine); +} + +static COFF::MachineTypes getBitcodeFileMachine(MemoryBufferRef MB) { + Expected TripleStr = getBitcodeTargetTriple(MB); + if (!TripleStr) { + llvm::errs() << MB.getBufferIdentifier() + << ": failed to get target triple from bitcode\n"; + exit(1); + } + + switch (Triple(*TripleStr).getArch()) { + case Triple::x86: + return COFF::IMAGE_FILE_MACHINE_I386; + case Triple::x86_64: + return COFF::IMAGE_FILE_MACHINE_AMD64; + case Triple::arm: + return COFF::IMAGE_FILE_MACHINE_ARMNT; + case Triple::aarch64: + return COFF::IMAGE_FILE_MACHINE_ARM64; + default: + llvm::errs() << MB.getBufferIdentifier() + << ": unknown arch in target triple " << *TripleStr << '\n'; + exit(1); + } +} + +static void appendFile(std::vector &Members, + COFF::MachineTypes &LibMachine, + std::string &LibMachineSource, MemoryBufferRef MB) { + file_magic Magic = identify_magic(MB.getBuffer()); + + if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && + Magic != file_magic::archive && Magic != file_magic::windows_resource) { + llvm::errs() << MB.getBufferIdentifier() + << ": not a COFF object, bitcode, archive or resource file\n"; + exit(1); + } + + // If a user attempts to add an archive to another archive, llvm-lib doesn't + // handle the first archive file as a single file. Instead, it extracts all + // members from the archive and add them to the second archive. This beahvior + // is for compatibility with Microsoft's lib command. + if (Magic == file_magic::archive) { + Error Err = Error::success(); + object::Archive Archive(MB, Err); + fatalOpenError(std::move(Err), MB.getBufferIdentifier()); + + for (auto &C : Archive.children(Err)) { + Expected ChildMB = C.getMemoryBufferRef(); + if (!ChildMB) { + handleAllErrors(ChildMB.takeError(), [&](const ErrorInfoBase &EIB) { + llvm::errs() << MB.getBufferIdentifier() << ": " << EIB.message() + << "\n"; + }); + exit(1); + } + + appendFile(Members, LibMachine, LibMachineSource, *ChildMB); + } + + fatalOpenError(std::move(Err), MB.getBufferIdentifier()); + return; + } + + // Check that all input files have the same machine type. + // Mixing normal objects and LTO bitcode files is fine as long as they + // have the same machine type. 
+ // Doing this here duplicates the header parsing work that writeArchive() + // below does, but it's not a lot of work and it's a bit awkward to do + // in writeArchive() which needs to support many tools, can't assume the + // input is COFF, and doesn't have a good way to report errors. + if (Magic == file_magic::coff_object || Magic == file_magic::bitcode) { + COFF::MachineTypes FileMachine = (Magic == file_magic::coff_object) + ? getCOFFFileMachine(MB) + : getBitcodeFileMachine(MB); + + // FIXME: Once lld-link rejects multiple resource .obj files: + // Call convertResToCOFF() on .res files and add the resulting + // COFF file to the .lib output instead of adding the .res file, and remove + // this check. See PR42180. + if (FileMachine != COFF::IMAGE_FILE_MACHINE_UNKNOWN) { + if (LibMachine == COFF::IMAGE_FILE_MACHINE_UNKNOWN) { + LibMachine = FileMachine; + LibMachineSource = + (" (inferred from earlier file '" + MB.getBufferIdentifier() + "')") + .str(); + } else if (LibMachine != FileMachine) { + llvm::errs() << MB.getBufferIdentifier() << ": file machine type " + << machineToStr(FileMachine) + << " conflicts with library machine type " + << machineToStr(LibMachine) << LibMachineSource << '\n'; + exit(1); + } + } + } + + Members.emplace_back(MB); +} + int llvm::libDriverMain(ArrayRef ArgsArr) { BumpPtrAllocator Alloc; StringSaver Saver(Alloc); @@ -195,104 +315,40 @@ int llvm::libDriverMain(ArrayRef ArgsArr) { std::string(" (from '/machine:") + Arg->getValue() + "' flag)"; } - // Create a NewArchiveMember for each input file. + std::vector> MBs; + StringSet<> Seen; std::vector Members; + + // Create a NewArchiveMember for each input file. for (auto *Arg : Args.filtered(OPT_INPUT)) { + // Find a file std::string Path = findInputFile(Arg->getValue(), SearchPaths); if (Path.empty()) { llvm::errs() << Arg->getValue() << ": no such file or directory\n"; return 1; } - Expected MOrErr = - NewArchiveMember::getFile(Saver.save(Path), /*Deterministic=*/true); - if (!MOrErr) { - handleAllErrors(MOrErr.takeError(), [&](const ErrorInfoBase &EIB) { - llvm::errs() << Arg->getValue() << ": " << EIB.message() << "\n"; - }); - return 1; - } - - file_magic Magic = identify_magic(MOrErr->Buf->getBuffer()); - if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && - Magic != file_magic::windows_resource) { - llvm::errs() << Arg->getValue() - << ": not a COFF object, bitcode or resource file\n"; - return 1; - } - - // Check that all input files have the same machine type. - // Mixing normal objects and LTO bitcode files is fine as long as they - // have the same machine type. - // Doing this here duplicates the header parsing work that writeArchive() - // below does, but it's not a lot of work and it's a bit awkward to do - // in writeArchive() which needs to support many tools, can't assume the - // input is COFF, and doesn't have a good way to report errors. 
- COFF::MachineTypes FileMachine = COFF::IMAGE_FILE_MACHINE_UNKNOWN; - if (Magic == file_magic::coff_object) { - std::error_code EC; - object::COFFObjectFile Obj(*MOrErr->Buf, EC); - if (EC) { - llvm::errs() << Arg->getValue() << ": failed to open: " << EC.message() - << '\n'; - return 1; - } - uint16_t Machine = Obj.getMachine(); - if (Machine != COFF::IMAGE_FILE_MACHINE_I386 && - Machine != COFF::IMAGE_FILE_MACHINE_AMD64 && - Machine != COFF::IMAGE_FILE_MACHINE_ARMNT && - Machine != COFF::IMAGE_FILE_MACHINE_ARM64) { - llvm::errs() << Arg->getValue() << ": unknown machine: " << Machine - << '\n'; - return 1; - } - FileMachine = static_cast(Machine); - } else if (Magic == file_magic::bitcode) { - Expected TripleStr = getBitcodeTargetTriple(*MOrErr->Buf); - if (!TripleStr) { - llvm::errs() << Arg->getValue() - << ": failed to get target triple from bitcode\n"; - return 1; - } - switch (Triple(*TripleStr).getArch()) { - case Triple::x86: - FileMachine = COFF::IMAGE_FILE_MACHINE_I386; - break; - case Triple::x86_64: - FileMachine = COFF::IMAGE_FILE_MACHINE_AMD64; - break; - case Triple::arm: - FileMachine = COFF::IMAGE_FILE_MACHINE_ARMNT; - break; - case Triple::aarch64: - FileMachine = COFF::IMAGE_FILE_MACHINE_ARM64; - break; - default: - llvm::errs() << Arg->getValue() << ": unknown arch in target triple " - << *TripleStr << '\n'; - return 1; - } - } - - // FIXME: Once lld-link rejects multiple resource .obj files: - // Call convertResToCOFF() on .res files and add the resulting - // COFF file to the .lib output instead of adding the .res file, and remove - // this check. See PR42180. - if (FileMachine != COFF::IMAGE_FILE_MACHINE_UNKNOWN) { - if (LibMachine == COFF::IMAGE_FILE_MACHINE_UNKNOWN) { - LibMachine = FileMachine; - LibMachineSource = std::string(" (inferred from earlier file '") + - Arg->getValue() + "')"; - } else if (LibMachine != FileMachine) { - llvm::errs() << Arg->getValue() << ": file machine type " - << machineToStr(FileMachine) - << " conflicts with library machine type " - << machineToStr(LibMachine) << LibMachineSource << '\n'; - return 1; - } - } - - Members.emplace_back(std::move(*MOrErr)); + // Input files are uniquified by pathname. If you specify the exact same + // path more than once, all but the first one are ignored. + // + // Note that there's a loophole in the rule; you can prepend `.\` or + // something like that to a path to make it look different, and they are + // handled as if they were different files. This behavior is compatible with + // Microsoft lib.exe. + if (!Seen.insert(Path).second) + continue; + + // Open a file. + ErrorOr> MOrErr = + MemoryBuffer::getFile(Path, -1, false); + fatalOpenError(errorCodeToError(MOrErr.getError()), Path); + MemoryBufferRef MBRef = (*MOrErr)->getMemBufferRef(); + + // Append a file. + appendFile(Members, LibMachine, LibMachineSource, MBRef); + + // Take the ownership of the file buffer to keep the file open. + MBs.push_back(std::move(*MOrErr)); } // Create an archive file. 
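Not part of the patch: a standalone model of two llvm-lib behaviours introduced above, exact-spelling de-duplication of input paths and the single-machine-type check. The Lib struct and Machine enum are illustrative stand-ins, not the driver's real types.

    // libdriver_model.cpp -- illustrative only, not LLVM code.
    #include <cassert>
    #include <set>
    #include <string>
    #include <vector>

    enum class Machine { Unknown, I386, AMD64, ARMNT, ARM64 };

    struct Lib {
      Machine LibMachine = Machine::Unknown;
      std::set<std::string> Seen;
      std::vector<std::string> Members;

      // Returns false on a machine-type conflict (the real driver exits).
      bool append(const std::string &Path, Machine FileMachine) {
        if (!Seen.insert(Path).second)
          return true;                   // exact duplicate path: ignored
        if (FileMachine != Machine::Unknown) {
          if (LibMachine == Machine::Unknown)
            LibMachine = FileMachine;    // first file decides the type
          else if (LibMachine != FileMachine)
            return false;                // conflict with an earlier file
        }
        Members.push_back(Path);
        return true;
      }
    };

    int main() {
      Lib L;
      assert(L.append("a.obj", Machine::AMD64));
      assert(L.append("a.obj", Machine::AMD64));    // duplicate: ignored
      assert(L.Members.size() == 1);
      assert(L.append(".\\a.obj", Machine::AMD64)); // different spelling: kept
      assert(!L.append("b.obj", Machine::I386));    // machine mismatch
    }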
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 1455a906103aa5..58ce91c807dd77 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -2467,7 +2467,7 @@ struct AAAlignImpl : AAAlign { if (SI->getAlignment() < getAssumedAlign()) { STATS_DECLTRACK(AAAlign, Store, "Number of times alignemnt added to a store"); - SI->setAlignment(getAssumedAlign()); + SI->setAlignment(Align(getAssumedAlign())); Changed = ChangeStatus::CHANGED; } } else if (auto *LI = dyn_cast(U.getUser())) { diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 9c7fd5e1a813f6..feac1b60884897 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2285,14 +2285,10 @@ OptimizeFunctions(Module &M, // So, remove unreachable blocks from the function, because a) there's // no point in analyzing them and b) GlobalOpt should otherwise grow // some more complicated logic to break these cycles. - // Removing unreachable blocks might invalidate the dominator so we - // recalculate it. if (!F->isDeclaration()) { - if (removeUnreachableBlocks(*F)) { - auto &DT = LookupDomTree(*F); - DT.recalculate(*F); - Changed = true; - } + auto &DT = LookupDomTree(*F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + Changed |= removeUnreachableBlocks(*F, &DTU); } Changed |= processGlobal(*F, GetTLI, LookupDomTree); diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 31571c4a20a87f..bd641da37f5523 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -628,11 +628,6 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { } while (!Region.empty()); } - // We need to explicitly clear the assumption cache since the value tracking - // may now be invalid as part of the function has changed. - if (Changed) - if (AssumptionCache *AC = LookupAC(F)) - AC->clear(); return Changed; } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 1b83ccb816e9a4..36562961744f79 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1887,6 +1887,17 @@ bool LowerTypeTestsModule::lower() { CfiFunctionLinkage Linkage = P.second.Linkage; MDNode *FuncMD = P.second.FuncMD; Function *F = M.getFunction(FunctionName); + if (F && F->hasLocalLinkage()) { + // Locally defined function that happens to have the same name as a + // function defined in a ThinLTO module. Rename it to move it out of + // the way of the external reference that we're about to create. + // Note that setName will find a unique name for the function, so even + // if there is an existing function with the suffix there won't be a + // name collision. 
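Not part of the patch: a sketch of the uniquifying rename that the setName call just below relies on, using a hypothetical std::set symbol table in place of the module's real one; the exact suffix scheme differs, the point is only that a taken name is never reused.

    // unique_name_model.cpp -- illustrative only, not LLVM code.
    #include <cassert>
    #include <set>
    #include <string>

    static std::string setName(std::set<std::string> &SymTab, std::string Name) {
      std::string Candidate = Name;
      // Keep appending a counter until the name no longer collides.
      for (unsigned Counter = 0; !SymTab.insert(Candidate).second; ++Counter)
        Candidate = Name + "." + std::to_string(Counter);
      return Candidate;
    }

    int main() {
      std::set<std::string> SymTab = {"f", "f.1"};
      // Requesting "f.1" while it is taken yields a fresh, suffixed name.
      std::string NewName = setName(SymTab, "f.1");
      assert(SymTab.count(NewName) && NewName != "f.1");
    }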
+ F->setName(F->getName() + ".1"); + F = nullptr; + } + if (!F) F = Function::Create( FunctionType::get(Type::getVoidTy(M.getContext()), false), diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 62f4584d5f58d0..a0f0b6726cc2b8 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -1264,7 +1264,7 @@ std::pair PartialInlinerImpl::unswitchFunction(Function *F) { if (PSI->isFunctionEntryCold(F)) return {false, nullptr}; - if (empty(F->users())) + if (F->users().empty()) return {false, nullptr}; OptimizationRemarkEmitter ORE(F); @@ -1370,7 +1370,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { return false; } - assert(empty(Cloner.OrigFunc->users()) && + assert(Cloner.OrigFunc->users().empty() && "F's users should all be replaced!"); std::vector Users(Cloner.ClonedFunc->user_begin(), diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 4490a7ef62adf0..4055fe049999b1 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -713,7 +713,7 @@ void runWholeProgramDevirtOnIndex( void updateIndexWPDForExports( ModuleSummaryIndex &Summary, - StringMap &ExportLists, + function_ref isExported, std::map> &LocalWPDTargetsMap) { for (auto &T : LocalWPDTargetsMap) { auto &VI = T.first; @@ -721,9 +721,7 @@ void updateIndexWPDForExports( assert(VI.getSummaryList().size() == 1 && "Devirt of local target has more than one copy"); auto &S = VI.getSummaryList()[0]; - const auto &ExportList = ExportLists.find(S->modulePath()); - if (ExportList == ExportLists.end() || - !ExportList->second.count(VI.getGUID())) + if (!isExported(S->modulePath(), VI.getGUID())) continue; // It's been exported by a cross module import. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index 2cfd3f5bb17f6c..825f4b468b0a7f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -124,7 +124,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { auto *SI = new StoreInst(RMWI.getValOperand(), RMWI.getPointerOperand(), &RMWI); SI->setAtomic(Ordering, RMWI.getSyncScopeID()); - SI->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + SI->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return eraseInstFromFunction(RMWI); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index bc458ebf65212d..8d4b0dc0a7a718 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -185,7 +185,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); LoadInst *L = Builder.CreateLoad(IntType, Src); // Alignment from the mem intrinsic will be better, so use it. - L->setAlignment(MaybeAlign(CopySrcAlign)); + L->setAlignment( + MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) L->setMetadata(LLVMContext::MD_tbaa, CopyMD); MDNode *LoopMemParallelMD = @@ -198,7 +199,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. 
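Not part of the patch: several hunks in this area (Attributor, InstCombine) migrate setAlignment calls to the Align/MaybeAlign wrappers. A minimal stand-in showing the intended contract, assuming Align holds a known power of two and MaybeAlign built from 0 means "alignment unknown"; the real types are LLVM's, this is only a model.

    // align_wrapper_model.cpp -- illustrative only, not LLVM code.
    #include <cassert>
    #include <cstdint>
    #include <optional>

    struct Align {
      uint64_t Value;
      explicit Align(uint64_t V) : Value(V) {
        assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
      }
    };

    // A raw value of 0 historically meant "unspecified", so it maps to empty.
    using MaybeAlign = std::optional<Align>;
    static MaybeAlign makeMaybeAlign(uint64_t V) {
      return V ? MaybeAlign(Align(V)) : std::nullopt;
    }

    int main() {
      Align A(16);
      assert(A.Value == 16);
      assert(!makeMaybeAlign(0).has_value());   // 0 == unknown alignment
      assert(makeMaybeAlign(8)->Value == 8);
    }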
- S->setAlignment(CopyDstAlign); + S->setAlignment( + MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) S->setMetadata(LLVMContext::MD_tbaa, CopyMD); if (LoopMemParallelMD) @@ -223,9 +225,10 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { } Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); - if (MI->getDestAlignment() < Alignment) { - MI->setDestAlignment(Alignment); + const unsigned KnownAlignment = + getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); + if (MI->getDestAlignment() < KnownAlignment) { + MI->setDestAlignment(KnownAlignment); return MI; } @@ -243,13 +246,9 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { ConstantInt *FillC = dyn_cast(MI->getValue()); if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) return nullptr; - uint64_t Len = LenC->getLimitedValue(); - Alignment = MI->getDestAlignment(); + const uint64_t Len = LenC->getLimitedValue(); assert(Len && "0-sized memory setting should be removed already."); - - // Alignment 0 is identity for alignment 1 for memset, but not store. - if (Alignment == 0) - Alignment = 1; + const Align Alignment = assumeAligned(MI->getDestAlignment()); // If it is an atomic and alignment is less than the size then we will // introduce the unaligned memory access which will be later transformed diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 74c69808f15856..c58e63d08e31ca 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2338,8 +2338,15 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If we found a path from the src to dest, create the getelementptr now. if (SrcElTy == DstElTy) { SmallVector Idxs(NumZeros + 1, Builder.getInt32(0)); - return GetElementPtrInst::CreateInBounds(SrcPTy->getElementType(), Src, - Idxs); + GetElementPtrInst *GEP = + GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs); + + // If the source pointer is dereferenceable, then assume it points to an + // allocated object and apply "inbounds" to the GEP. + bool CanBeNull; + if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) + GEP->setIsInBounds(); + return GEP; } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index ddc7de39d8d2aa..f07f64e3f02ea5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1384,6 +1384,29 @@ Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero); } +/// Fold equality-comparison between zero and any (maybe truncated) right-shift +/// by one-less-than-bitwidth into a sign test on the original value. +Instruction *foldSignBitTest(ICmpInst &I) { + ICmpInst::Predicate Pred; + Value *X; + Constant *C; + if (!I.isEquality() || + !match(&I, m_ICmp(Pred, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))), + m_Zero()))) + return nullptr; + + Type *XTy = X->getType(); + unsigned XBitWidth = XTy->getScalarSizeInBits(); + if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(XBitWidth, XBitWidth - 1)))) + return nullptr; + + return ICmpInst::Create(Instruction::ICmp, + Pred == ICmpInst::ICMP_EQ ? 
ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + X, ConstantInt::getNullValue(XTy)); +} + // Handle icmp pred X, 0 Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { CmpInst::Predicate Pred = Cmp.getPredicate(); @@ -5449,6 +5472,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpInstWithConstant(I)) return Res; + // Try to match comparison as a sign bit test. Intentionally do this after + // foldICmpInstWithConstant() to potentially let other folds to happen first. + if (Instruction *New = foldSignBitTest(I)) + return New; + if (Instruction *Res = foldICmpInstWithConstantNotInt(I)) return Res; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 673099436b7942..dcdbee15fe56b2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -351,6 +351,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); + Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr); Instruction *visitAShr(BinaryOperator &I); Instruction *visitLShr(BinaryOperator &I); Instruction *commonShiftTransforms(BinaryOperator &I); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index eb01b4b7d7d150..4c5e1cc43760a7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1356,15 +1356,15 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return eraseInstFromFunction(SI); // Attempt to improve the alignment. - unsigned KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT); - unsigned StoreAlign = SI.getAlignment(); - unsigned EffectiveStoreAlign = - StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType()); + const Align KnownAlign = Align(getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT)); + const MaybeAlign StoreAlign = MaybeAlign(SI.getAlignment()); + const Align EffectiveStoreAlign = + StoreAlign ? *StoreAlign : Align(DL.getABITypeAlignment(Val->getType())); if (KnownAlign > EffectiveStoreAlign) SI.setAlignment(KnownAlign); - else if (StoreAlign == 0) + else if (!StoreAlign) SI.setAlignment(EffectiveStoreAlign); // Try to canonicalize the stored type. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index a301d9eef60e80..0b9128a9f5a1c2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -124,6 +124,50 @@ static Constant *getLogBase2(Type *Ty, Constant *C) { return ConstantVector::get(Elts); } +// TODO: This is a specific form of a much more general pattern. +// We could detect a select with any binop identity constant, or we +// could use SimplifyBinOp to see if either arm of the select reduces. +// But that needs to be done carefully and/or while removing potential +// reverse canonicalizations as in InstCombiner::foldSelectIntoOp(). 
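+// A sketch of the intended rewrite on hypothetical IR (names invented here):
+//   %sel = select i1 %c, i32 1, i32 -1
+//   %mul = mul i32 %sel, %x
+// becomes
+//   %neg = sub i32 0, %x
+//   %mul = select i1 %c, i32 %x, i32 %neg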
+static Value *foldMulSelectToNegate(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *Cond, *OtherOp; + + // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp + // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp)); + + // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp + // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp); + + // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp + // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0), + m_SpecificFP(-1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp)); + } + + // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp + // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0), + m_SpecificFP(1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp); + } + + return nullptr; +} + Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) @@ -213,24 +257,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; - // TODO: This is a specific form of a much more general pattern. - // We could detect a select with any binop identity constant, or we - // could use SimplifyBinOp to see if either arm of the select reduces. - // But that needs to be done carefully and/or while removing potential - // reverse canonicalizations as in InstCombiner::foldSelectIntoOp(). - // mul (select Cond, 1, -1), Op1 --> select Cond, Op1, -Op1 - // mul (select Cond, -1, 1), Op1 --> select Cond, -Op1, Op1 - // mul Op0, (select Cond, 1, -1) --> select Cond, Op0, -Op0 - // mul Op0, (select Cond, -1, 1) --> select Cond, -Op0, Op0 - Value *Cond; - if (match(Op0, m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())))) - return SelectInst::Create(Cond, Op1, Builder.CreateNeg(Op1)); - if (match(Op0, m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())))) - return SelectInst::Create(Cond, Builder.CreateNeg(Op1), Op1); - if (match(Op1, m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())))) - return SelectInst::Create(Cond, Op0, Builder.CreateNeg(Op0)); - if (match(Op1, m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())))) - return SelectInst::Create(Cond, Builder.CreateNeg(Op0), Op0); + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); // Simplify mul instructions with a constant RHS. 
if (isa(Op1)) { @@ -377,6 +405,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + // X * -1.0 --> -X Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (match(Op1, m_SpecificFP(-1.0))) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index bc4affbecdfa37..8ab4aeb38beaa6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -61,16 +61,10 @@ reassociateShiftAmtsOfTwoSameDirectionShifts(BinaryOperator *Sh0, if (ShiftOpcode != Sh1->getOpcode()) return nullptr; - // Did we match a pattern with truncation ? - if (Trunc) { - // For right-shifts we can't do any such simplifications. Leave as-is. - if (ShiftOpcode != Instruction::BinaryOps::Shl) - return nullptr; // FIXME: still could perform constant-folding. - // If we saw truncation, we'll need to produce extra instruction, - // and for that one of the operands of the shift must be one-use. - if (!match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) - return nullptr; - } + // If we saw truncation, we'll need to produce extra instruction, + // and for that one of the operands of the shift must be one-use. + if (Trunc && !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; // Can we fold (ShAmt0+ShAmt1) ? auto *NewShAmt = dyn_cast_or_null( @@ -78,13 +72,23 @@ reassociateShiftAmtsOfTwoSameDirectionShifts(BinaryOperator *Sh0, SQ.getWithInstruction(Sh0))); if (!NewShAmt) return nullptr; // Did not simplify. - // Is the new shift amount smaller than the bit width of inner shift? - if (!match(NewShAmt, m_SpecificInt_ICMP( - ICmpInst::Predicate::ICMP_ULT, - APInt(NewShAmt->getType()->getScalarSizeInBits(), - X->getType()->getScalarSizeInBits())))) + unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits(); + unsigned XBitWidth = X->getType()->getScalarSizeInBits(); + // Is the new shift amount smaller than the bit width of inner/new shift? + if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt(NewShAmtBitWidth, XBitWidth)))) return nullptr; // FIXME: could perform constant-folding. + // If there was a truncation, and we have a right-shift, we can only fold if + // we are left with the original sign bit. + // FIXME: zero shift amount is also legal here, but we can't *easily* check + // more than one predicate so it's not really worth it. + if (Trunc && ShiftOpcode != Instruction::BinaryOps::Shl && + !match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(NewShAmtBitWidth, XBitWidth - 1)))) + return nullptr; + // All good, we can do this fold. NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType()); @@ -1039,6 +1043,75 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr) { + assert(OldAShr.getOpcode() == Instruction::AShr && + "Must be called with arithmetic right-shift instruction only."); + + // Check that constant C is a splat of the element-wise bitwidth of V. 
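+  // For example, for a <2 x i32> value V this accepts C == <i32 32, i32 32>,
+  // and for a scalar i64 V it accepts C == i64 64.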
+ auto BitWidthSplat = [](Constant *C, Value *V) { + return match( + C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + V->getType()->getScalarSizeInBits()))); + }; + + // It should look like variable-length sign-extension on the outside: + // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits) + Value *NBits; + Instruction *MaybeTrunc; + Constant *C1, *C2; + if (!match(&OldAShr, + m_AShr(m_Shl(m_Instruction(MaybeTrunc), + m_ZExtOrSelf(m_Sub(m_Constant(C1), + m_ZExtOrSelf(m_Value(NBits))))), + m_ZExtOrSelf(m_Sub(m_Constant(C2), + m_ZExtOrSelf(m_Deferred(NBits)))))) || + !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr)) + return nullptr; + + // There may or may not be a truncation after outer two shifts. + Instruction *HighBitExtract; + match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract))); + bool HadTrunc = MaybeTrunc != HighBitExtract; + + // And finally, the innermost part of the pattern must be a right-shift. + Value *X, *NumLowBitsToSkip; + if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip)))) + return nullptr; + + // Said right-shift must extract high NBits bits - C0 must be it's bitwidth. + Constant *C0; + if (!match(NumLowBitsToSkip, + m_ZExtOrSelf( + m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) || + !BitWidthSplat(C0, HighBitExtract)) + return nullptr; + + // Since the NBits is identical for all shifts, if the outermost and + // innermost shifts are identical, then outermost shifts are redundant. + // If we had truncation, do keep it though. + if (HighBitExtract->getOpcode() == OldAShr.getOpcode()) + return replaceInstUsesWith(OldAShr, MaybeTrunc); + + // Else, if there was a truncation, then we need to ensure that one + // instruction will go away. + if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Finally, bypass two innermost shifts, and perform the outermost shift on + // the operands of the innermost shift. + Instruction *NewAShr = + BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip); + NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); +} + Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) @@ -1113,6 +1186,9 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { } } + if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I)) + return R; + // See if we can turn a signed shr into an unsigned shr. 
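+  // (ashr and lshr agree whenever the sign bit of Op0 is known to be zero, so
+  // the unsigned form is safe here.)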
if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I)) return BinaryOperator::CreateLShr(Op0, Op1); diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 78b697f7f94036..22190ad7a0ae98 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -14,6 +14,7 @@ add_llvm_library(LLVMInstrumentation PGOMemOPSizeOpt.cpp PoisonChecking.cpp SanitizerCoverage.cpp + ValueProfileCollector.cpp ThreadSanitizer.cpp HWAddressSanitizer.cpp diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 81bd2f3c18ac96..f9354069da3261 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2562,6 +2562,11 @@ struct MemorySanitizerVisitor : public InstVisitor { return false; } + void handleInvariantGroup(IntrinsicInst &I) { + setShadow(&I, getShadow(&I, 0)); + setOrigin(&I, getOrigin(&I, 0)); + } + void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; @@ -2993,6 +2998,10 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::lifetime_start: handleLifetimeStart(I); break; + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + handleInvariantGroup(I); + break; case Intrinsic::bswap: handleBswap(I); break; diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index e776d59cccb5ba..3862f19ab7abed 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -48,6 +48,7 @@ //===----------------------------------------------------------------------===// #include "CFGMST.h" +#include "ValueProfileCollector.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -61,7 +62,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/IndirectCallVisitor.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -121,6 +121,7 @@ using namespace llvm; using ProfileCount = Function::ProfileCount; +using VPCandidateInfo = ValueProfileCollector::CandidateInfo; #define DEBUG_TYPE "pgo-instrumentation" @@ -287,6 +288,11 @@ static std::string getBranchCondString(Instruction *TI) { return result; } +static const char *ValueProfKindDescr[] = { +#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, +#include "llvm/ProfileData/InstrProfData.inc" +}; + namespace { /// The select instruction visitor plays three roles specified @@ -349,50 +355,6 @@ struct SelectInstVisitor : public InstVisitor { unsigned getNumOfSelectInsts() const { return NSIs; } }; -/// Instruction Visitor class to visit memory intrinsic calls. -struct MemIntrinsicVisitor : public InstVisitor { - Function &F; - unsigned NMemIs = 0; // Number of memIntrinsics instrumented. - VisitMode Mode = VM_counting; // Visiting mode. - unsigned CurCtrId = 0; // Current counter index. 
- unsigned TotalNumCtrs = 0; // Total number of counters - GlobalVariable *FuncNameVar = nullptr; - uint64_t FuncHash = 0; - PGOUseFunc *UseFunc = nullptr; - std::vector Candidates; - - MemIntrinsicVisitor(Function &Func) : F(Func) {} - - void countMemIntrinsics(Function &Func) { - NMemIs = 0; - Mode = VM_counting; - visit(Func); - } - - void instrumentMemIntrinsics(Function &Func, unsigned TotalNC, - GlobalVariable *FNV, uint64_t FHash) { - Mode = VM_instrument; - TotalNumCtrs = TotalNC; - FuncHash = FHash; - FuncNameVar = FNV; - visit(Func); - } - - std::vector findMemIntrinsics(Function &Func) { - Candidates.clear(); - Mode = VM_annotate; - visit(Func); - return Candidates; - } - - // Visit the IR stream and annotate all mem intrinsic call instructions. - void instrumentOneMemIntrinsic(MemIntrinsic &MI); - - // Visit \p MI instruction and perform tasks according to visit mode. - void visitMemIntrinsic(MemIntrinsic &SI); - - unsigned getNumOfMemIntrinsics() const { return NMemIs; } -}; class PGOInstrumentationGenLegacyPass : public ModulePass { public: @@ -564,13 +526,14 @@ template class FuncPGOInstrumentation { // A map that stores the Comdat group in function F. std::unordered_multimap &ComdatMembers; + ValueProfileCollector VPC; + void computeCFGHash(); void renameComdatFunction(); public: - std::vector> ValueSites; + std::vector> ValueSites; SelectInstVisitor SIVisitor; - MemIntrinsicVisitor MIVisitor; std::string FuncName; GlobalVariable *FuncNameVar; @@ -605,23 +568,21 @@ template class FuncPGOInstrumentation { std::unordered_multimap &ComdatMembers, bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr, bool IsCS = false) - : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), - ValueSites(IPVK_Last + 1), SIVisitor(Func), MIVisitor(Func), - MST(F, BPI, BFI) { + : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func), + ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) { // This should be done before CFG hash computation. 
SIVisitor.countSelects(Func); - MIVisitor.countMemIntrinsics(Func); + ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize); if (!IsCS) { NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfPGOBB += MST.BBInfos.size(); - ValueSites[IPVK_IndirectCallTarget] = findIndirectCalls(Func); + ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget); } else { NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfCSPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfCSPGOBB += MST.BBInfos.size(); } - ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func); FuncName = getPGOFuncName(F); computeCFGHash(); @@ -875,28 +836,36 @@ static void instrumentOneFunc( if (DisableValueProfiling) return; - unsigned NumIndirectCalls = 0; - for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) { - CallSite CS(I); - Value *Callee = CS.getCalledValue(); - LLVM_DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = " - << NumIndirectCalls << "\n"); - IRBuilder<> Builder(I); - assert(Builder.GetInsertPoint() != I->getParent()->end() && - "Cannot get the Instrumentation point"); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), - Builder.getInt64(FuncInfo.FunctionHash), - Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()), - Builder.getInt32(IPVK_IndirectCallTarget), - Builder.getInt32(NumIndirectCalls++)}); - } - NumOfPGOICall += NumIndirectCalls; + NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size(); + + // For each VP Kind, walk the VP candidates and instrument each one. + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { + unsigned SiteIndex = 0; + if (Kind == IPVK_MemOPSize && !PGOInstrMemOP) + continue; - // Now instrument memop intrinsic calls. 
- FuncInfo.MIVisitor.instrumentMemIntrinsics( - F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash); + for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) { + LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind] + << " site: CallSite Index = " << SiteIndex << "\n"); + + IRBuilder<> Builder(Cand.InsertPt); + assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() && + "Cannot get the Instrumentation point"); + + Value *ToProfile = nullptr; + if (Cand.V->getType()->isIntegerTy()) + ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty()); + else if (Cand.V->getType()->isPointerTy()) + ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); + assert(ToProfile && "value profiling Value is of unexpected type"); + + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), + {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), ToProfile, + Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}); + } + } // IPVK_First <= Kind <= IPVK_Last } namespace { @@ -1429,43 +1398,6 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) { llvm_unreachable("Unknown visiting mode"); } -void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) { - Module *M = F.getParent(); - IRBuilder<> Builder(&MI); - Type *Int64Ty = Builder.getInt64Ty(); - Type *I8PtrTy = Builder.getInt8PtrTy(); - Value *Length = MI.getLength(); - assert(!isa(Length)); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), - Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty), - Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)}); - ++CurCtrId; -} - -void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) { - if (!PGOInstrMemOP) - return; - Value *Length = MI.getLength(); - // Not instrument constant length calls. - if (dyn_cast(Length)) - return; - - switch (Mode) { - case VM_counting: - NMemIs++; - return; - case VM_instrument: - instrumentOneMemIntrinsic(MI); - return; - case VM_annotate: - Candidates.push_back(&MI); - return; - } - llvm_unreachable("Unknown visiting mode"); -} - // Traverse all valuesites and annotate the instructions for all value kind. void PGOUseFunc::annotateValueSites() { if (DisableValueProfiling) @@ -1478,11 +1410,6 @@ void PGOUseFunc::annotateValueSites() { annotateValueSites(Kind); } -static const char *ValueProfKindDescr[] = { -#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, -#include "llvm/ProfileData/InstrProfData.inc" -}; - // Annotate the instructions for a specific value kind. void PGOUseFunc::annotateValueSites(uint32_t Kind) { assert(Kind <= IPVK_Last); @@ -1501,11 +1428,11 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) { return; } - for (auto &I : ValueSites) { + for (VPCandidateInfo &I : ValueSites) { LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind << "): Index = " << ValueSiteIndex << " out of " << NumValueSites << "\n"); - annotateValueSite(*M, *I, ProfileRecord, + annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord, static_cast(Kind), ValueSiteIndex, Kind == IPVK_MemOPSize ? 
MaxNumMemOPAnnotations : MaxNumAnnotations); diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp new file mode 100644 index 00000000000000..604726d4f40fcc --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -0,0 +1,78 @@ +//===- ValueProfileCollector.cpp - determine what to value profile --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl +// +//===----------------------------------------------------------------------===// + +#include "ValueProfilePlugins.inc" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" + +#include + +using namespace llvm; + +namespace { + +/// A plugin-based class that takes an arbitrary number of Plugin types. +/// Each plugin type must satisfy the following API: +/// 1) the constructor must take a `Function &f`. Typically, the plugin would +/// scan the function looking for candidates. +/// 2) contain a member function with the following signature and name: +/// void run(std::vector &Candidates); +/// such that the plugin would append its result into the vector parameter. +/// +/// Plugins are defined in ValueProfilePlugins.inc +template class PluginChain; + +/// The type PluginChainFinal is the final chain of plugins that will be used by +/// ValueProfileCollectorImpl. +using PluginChainFinal = PluginChain; + +template <> class PluginChain<> { +public: + PluginChain(Function &F) {} + void get(InstrProfValueKind K, std::vector &Candidates) {} +}; + +template +class PluginChain : public PluginChain { + PluginT Plugin; + using Base = PluginChain; + +public: + PluginChain(Function &F) : PluginChain(F), Plugin(F) {} + + void get(InstrProfValueKind K, std::vector &Candidates) { + if (K == PluginT::Kind) + Plugin.run(Candidates); + Base::get(K, Candidates); + } +}; + +} // end anonymous namespace + +/// ValueProfileCollectorImpl inherits the API of PluginChainFinal. +class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal { +public: + using PluginChainFinal::PluginChainFinal; +}; + +ValueProfileCollector::ValueProfileCollector(Function &F) + : PImpl(new ValueProfileCollectorImpl(F)) {} + +ValueProfileCollector::~ValueProfileCollector() = default; + +std::vector +ValueProfileCollector::get(InstrProfValueKind Kind) const { + std::vector Result; + PImpl->get(Kind, Result); + return Result; +} diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h new file mode 100644 index 00000000000000..ff883c8d0c7798 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h @@ -0,0 +1,79 @@ +//===- ValueProfileCollector.h - determine what to value profile ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a utility class, ValueProfileCollector, that is used to +// determine what kind of llvm::Value's are worth value-profiling, at which +// point in the program, and which instruction holds the Value Profile metadata. +// Currently, the only users of this utility is the PGOInstrumentation[Gen|Use] +// passes. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H +#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" + +namespace llvm { + +/// Utility analysis that determines what values are worth profiling. +/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to +/// populate the Candidates vector. +/// +/// Value profiling an expression means to track the values that this expression +/// takes at runtime and the frequency of each value. +/// It is important to distinguish between two sets of value profiles for a +/// particular expression: +/// 1) The set of values at the point of evaluation. +/// 2) The set of values at the point of use. +/// In some cases, the two sets are identical, but it's not unusual for the two +/// to differ. +/// +/// To elaborate more, consider this C code, and focus on the expression `nn`: +/// void foo(int nn, bool b) { +/// if (b) memcpy(x, y, nn); +/// } +/// The point of evaluation can be as early as the start of the function, and +/// let's say the value profile for `nn` is: +/// total=100; (value,freq) set = {(8,10), (32,50)} +/// The point of use is right before we call memcpy, and since we execute the +/// memcpy conditionally, the value profile of `nn` can be: +/// total=15; (value,freq) set = {(8,10), (4,5)} +/// +/// For this reason, a plugin is responsible for computing the insertion point +/// for each value to be profiled. The `CandidateInfo` structure encapsulates +/// all the information needed for each value profile site. +class ValueProfileCollector { +public: + struct CandidateInfo { + Value *V; // The value to profile. + Instruction *InsertPt; // Insert the VP lib call before this instr. + Instruction *AnnotatedInst; // Where metadata is attached. + }; + + ValueProfileCollector(Function &Fn); + ValueProfileCollector(ValueProfileCollector &&) = delete; + ValueProfileCollector &operator=(ValueProfileCollector &&) = delete; + + ValueProfileCollector(const ValueProfileCollector &) = delete; + ValueProfileCollector &operator=(const ValueProfileCollector &) = delete; + ~ValueProfileCollector(); + + /// returns a list of value profiling candidates of the given kind + std::vector get(InstrProfValueKind Kind) const; + +private: + class ValueProfileCollectorImpl; + std::unique_ptr PImpl; +}; + +} // namespace llvm + +#endif diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc new file mode 100644 index 00000000000000..4cc4c6c848c39a --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -0,0 +1,75 @@ +//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a set of plugin classes used in ValueProfileCollectorImpl. +// Each plugin is responsible for collecting Value Profiling candidates for a +// particular optimization. +// Each plugin must satisfy the interface described in ValueProfileCollector.cpp +// +//===----------------------------------------------------------------------===// + +#include "ValueProfileCollector.h" +#include "llvm/Analysis/IndirectCallVisitor.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; +using CandidateInfo = ValueProfileCollector::CandidateInfo; + +///--------------------------- MemIntrinsicPlugin ------------------------------ +class MemIntrinsicPlugin : public InstVisitor { + Function &F; + std::vector *Candidates; + +public: + static constexpr InstrProfValueKind Kind = IPVK_MemOPSize; + + MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {} + + void run(std::vector &Cs) { + Candidates = &Cs; + visit(F); + Candidates = nullptr; + } + void visitMemIntrinsic(MemIntrinsic &MI) { + Value *Length = MI.getLength(); + // Not instrument constant length calls. + if (dyn_cast(Length)) + return; + + Instruction *InsertPt = &MI; + Instruction *AnnotatedInst = &MI; + Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst}); + } +}; + +///------------------------ IndirectCallPromotionPlugin ------------------------ +class IndirectCallPromotionPlugin { + Function &F; + +public: + static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget; + + IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {} + + void run(std::vector &Candidates) { + std::vector Result = findIndirectCalls(F); + for (Instruction *I : Result) { + Value *Callee = CallSite(I).getCalledValue(); + Instruction *InsertPt = I; + Instruction *AnnotatedInst = I; + Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst}); + } + } +}; + +///----------------------- Registration of the plugins ------------------------- +/// For now, registering a plugin with the ValueProfileCollector is done by +/// adding the plugin type to the VP_PLUGIN_LIST macro. 
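+/// A hypothetical new plugin (any type constructible from a Function & that
+/// exposes a static `Kind` member and a `run(std::vector<CandidateInfo> &)`
+/// method) would be registered by appending its type name to this list.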
+#define VP_PLUGIN_LIST \ + MemIntrinsicPlugin, \ + IndirectCallPromotionPlugin diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 2d135b41279f60..0e9f03a060611b 100644 --- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -329,7 +329,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlignment()) { - SI->setAlignment(NewAlignment); + SI->setAlignment(MaybeAlign(NewAlignment)); ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast(J)) { diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 79b901ac0db8fe..1f01ba2fbfc6c3 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -894,8 +894,8 @@ class GVNHoist { ++NumLoadsRemoved; } else if (auto *ReplacementStore = dyn_cast(Repl)) { ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast(I)->getAlignment())); + MaybeAlign(std::min(ReplacementStore->getAlignment(), + cast(I)->getAlignment()))); ++NumStoresRemoved; } else if (auto *ReplacementAlloca = dyn_cast(Repl)) { ReplacementAlloca->setAlignment( diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index a9fdfbaef3f4c3..1aaa0265bade63 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -2789,7 +2789,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { // have already been removed; TODO: generalize BasicBlock *ExitBlock = BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0); - if (!empty(ExitBlock->phis())) + if (!ExitBlock->phis().empty()) return true; const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index f099b4d20c4ef2..262d64f1618592 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1248,12 +1248,22 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // FIXME: More precise: no Uses that alias SI. if (!Flags->IsSink && !MSSA->dominates(SIMD, MU)) return false; - } else if (const auto *MD = dyn_cast(&MA)) + } else if (const auto *MD = dyn_cast(&MA)) { if (auto *LI = dyn_cast(MD->getMemoryInst())) { (void)LI; // Silence warning. assert(!LI->isUnordered() && "Expected unordered load"); return false; } + // Any call, while it may not be clobbering SI, it may be a use. + if (auto *CI = dyn_cast(MD->getMemoryInst())) { + // Check if the call may read from the memory locattion written + // to by SI. Check CI's attributes and arguments; the number of + // such checks performed is limited above by NoOfMemAccTooLarge. + ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI)); + if (isModOrRefSet(MRI)) + return false; + } + } } auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI); @@ -1383,8 +1393,7 @@ static Instruction *CloneInstructionInExitBlock( if (!I.getName().empty()) New->setName(I.getName() + ".le"); - MemoryAccess *OldMemAcc; - if (MSSAU && (OldMemAcc = MSSAU->getMemorySSA()->getMemoryAccess(&I))) { + if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) { // Create a new MemoryAccess and let MemorySSA set its defining access. 
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( New, nullptr, New->getParent(), MemorySSA::Beginning); @@ -1791,7 +1800,7 @@ class LoopPromoter : public LoadAndStorePromoter { StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); if (UnorderedAtomic) NewSI->setOrdering(AtomicOrdering::Unordered); - NewSI->setAlignment(Alignment); + NewSI->setAlignment(MaybeAlign(Alignment)); NewSI->setDebugLoc(DL); if (AATags) NewSI->setAAMetadata(AATags); @@ -2115,9 +2124,8 @@ bool llvm::promoteLoopAccessesToScalars( PreheaderLoad->setAAMetadata(AATags); SSA.AddAvailableValue(Preheader, PreheaderLoad); - MemoryAccess *PreheaderLoadMemoryAccess; if (MSSAU) { - PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( + MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); MemoryUse *NewMemUse = cast(PreheaderLoadMemoryAccess); MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index cdb1d79066773f..d85f20b3f80c10 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -161,7 +161,7 @@ static void handlePhiDef(CallInst *Expect) { return Result; }; - auto *PhiDef = dyn_cast(V); + auto *PhiDef = cast(V); // Get the first dominating conditional branch of the operand // i's incoming block. diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 91c879097afc23..c37da39b70b61d 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -1754,7 +1754,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef PHIOps, return true; }); // If we are left with no operands, it's dead. - if (empty(Filtered)) { + if (Filtered.empty()) { // If it has undef at this point, it means there are no-non-undef arguments, // and thus, the value of the phi node must be undef. if (HasUndef) { diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 47d226f83a4262..48bbdd8d1b33fe 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -2530,7 +2530,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // statepoints surviving this pass. This makes testing easier and the // resulting IR less confusing to human readers. DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - bool MadeChange = removeUnreachableBlocks(F, nullptr, &DTU); + bool MadeChange = removeUnreachableBlocks(F, &DTU); // Flush the Dominator Tree. 
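+  // With the Lazy update strategy, getDomTree() below is what actually
+  // applies the queued CFG updates to the tree.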
DTU.getDomTree(); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index cec65ba76edafe..c1e935fda7f860 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3127,7 +3127,7 @@ class llvm::sroa::AllocaSliceRewriter Value *Op = SI->getOperand(0); StoreAlign = DL.getABITypeAlignment(Op->getType()); } - SI->setAlignment(std::min(StoreAlign, getSliceAlign())); + SI->setAlignment(MaybeAlign(std::min(StoreAlign, getSliceAlign()))); continue; } diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index fa6d3f8ae87380..1fe520b161020f 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -293,10 +293,8 @@ static BasicBlock *getCommonExitBlock(const SetVector &Blocks) { CommonExitBlock = Succ; continue; } - if (CommonExitBlock == Succ) - continue; - - return true; + if (CommonExitBlock != Succ) + return true; } return false; }; @@ -537,15 +535,41 @@ void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, } } +bool CodeExtractor::isEligible() const { + if (Blocks.empty()) + return false; + BasicBlock *Header = *Blocks.begin(); + Function *F = Header->getParent(); + + // For functions with varargs, check that varargs handling is only done in the + // outlined function, i.e vastart and vaend are only used in outlined blocks. + if (AllowVarArgs && F->getFunctionType()->isVarArg()) { + auto containsVarArgIntrinsic = [](const Instruction &I) { + if (const CallInst *CI = dyn_cast(&I)) + if (const Function *Callee = CI->getCalledFunction()) + return Callee->getIntrinsicID() == Intrinsic::vastart || + Callee->getIntrinsicID() == Intrinsic::vaend; + return false; + }; + + for (auto &BB : *F) { + if (Blocks.count(&BB)) + continue; + if (llvm::any_of(BB, containsVarArgIntrinsic)) + return false; + } + } + return true; +} + void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &SinkCands) const { for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. for (Instruction &II : *BB) { - for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; - ++OI) { - Value *V = *OI; + for (auto &OI : II.operands()) { + Value *V = OI; if (!SinkCands.count(V) && definedInCaller(Blocks, V)) Inputs.insert(V); } @@ -1277,13 +1301,6 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { // Insert this basic block into the new function newBlocks.push_back(Block); - - // Remove @llvm.assume calls that were moved to the new function from the - // old function's assumption cache. - if (AC) - for (auto &I : *Block) - if (match(&I, m_Intrinsic())) - AC->unregisterAssumption(cast(&I)); } } @@ -1341,27 +1358,6 @@ Function *CodeExtractor::extractCodeRegion() { BasicBlock *header = *Blocks.begin(); Function *oldFunction = header->getParent(); - // For functions with varargs, check that varargs handling is only done in the - // outlined function, i.e vastart and vaend are only used in outlined blocks. 
- if (AllowVarArgs && oldFunction->getFunctionType()->isVarArg()) { - auto containsVarArgIntrinsic = [](Instruction &I) { - if (const CallInst *CI = dyn_cast(&I)) - if (const Function *F = CI->getCalledFunction()) - return F->getIntrinsicID() == Intrinsic::vastart || - F->getIntrinsicID() == Intrinsic::vaend; - return false; - }; - - for (auto &BB : *oldFunction) { - if (Blocks.count(&BB)) - continue; - if (llvm::any_of(BB, containsVarArgIntrinsic)) - return nullptr; - } - } - ValueSet inputs, outputs, SinkingCands, HoistingCands; - BasicBlock *CommonExit = nullptr; - // Calculate the entry frequency of the new function before we change the root // block. BlockFrequency EntryFreq; @@ -1375,6 +1371,15 @@ Function *CodeExtractor::extractCodeRegion() { } } + if (AC) { + // Remove @llvm.assume calls that were moved to the new function from the + // old function's assumption cache. + for (BasicBlock *Block : Blocks) + for (auto &I : *Block) + if (match(&I, m_Intrinsic())) + AC->unregisterAssumption(cast(&I)); + } + // If we have any return instructions in the region, split those blocks so // that the return is not in the region. splitReturnBlocks(); @@ -1428,6 +1433,8 @@ Function *CodeExtractor::extractCodeRegion() { } newFuncRoot->getInstList().push_back(BranchI); + ValueSet inputs, outputs, SinkingCands, HoistingCands; + BasicBlock *CommonExit = nullptr; findAllocas(SinkingCands, HoistingCands, CommonExit); assert(HoistingCands.empty() || CommonExit); @@ -1563,5 +1570,17 @@ Function *CodeExtractor::extractCodeRegion() { }); LLVM_DEBUG(if (verifyFunction(*oldFunction)) report_fatal_error("verification of oldFunction failed!")); + LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, AC)) + report_fatal_error("Stale Asumption cache for old Function!")); return newFunction; } + +bool CodeExtractor::verifyAssumptionCache(const Function& F, + AssumptionCache *AC) { + for (auto AssumeVH : AC->assumptions()) { + CallInst *I = cast(AssumeVH); + if (I->getFunction() != &F) + return true; + } + return false; +} diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2285b6c822a481..94339c2ba00f9a 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -2210,12 +2210,10 @@ void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) { /// removeUnreachableBlocks - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false -/// otherwise. If `LVI` is passed, this function preserves LazyValueInfo -/// after modifying the CFG. -bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, - DomTreeUpdater *DTU, +/// otherwise. +bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, MemorySSAUpdater *MSSAU) { - SmallPtrSet Reachable; + SmallPtrSet Reachable; bool Changed = markAliveBlocks(F, Reachable, DTU); // If there are unreachable blocks in the CFG... 
@@ -2223,21 +2221,21 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, return Changed; assert(Reachable.size() < F.size()); - NumRemoved += F.size()-Reachable.size(); + NumRemoved += F.size() - Reachable.size(); SmallSetVector DeadBlockSet; - for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ++I) { - auto *BB = &*I; - if (Reachable.count(BB)) + for (BasicBlock &BB : F) { + // Skip reachable basic blocks + if (Reachable.find(&BB) != Reachable.end()) continue; - DeadBlockSet.insert(BB); + DeadBlockSet.insert(&BB); } if (MSSAU) MSSAU->removeBlocks(DeadBlockSet); // Loop over all of the basic blocks that are not reachable, dropping all of - // their internal references. Update DTU and LVI if available. + // their internal references. Update DTU if available. std::vector Updates; for (auto *BB : DeadBlockSet) { for (BasicBlock *Successor : successors(BB)) { @@ -2246,26 +2244,18 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, if (DTU) Updates.push_back({DominatorTree::Delete, BB, Successor}); } - if (LVI) - LVI->eraseBlock(BB); BB->dropAllReferences(); - } - for (Function::iterator I = ++F.begin(); I != F.end();) { - auto *BB = &*I; - if (Reachable.count(BB)) { - ++I; - continue; - } if (DTU) { - // Remove the terminator of BB to clear the successor list of BB. - if (BB->getTerminator()) - BB->getInstList().pop_back(); + Instruction *TI = BB->getTerminator(); + assert(TI && "Basic block should have a terminator"); + // Terminators like invoke can have users. We have to replace their users, + // before removing them. + if (!TI->use_empty()) + TI->replaceAllUsesWith(UndefValue::get(TI->getType())); + TI->eraseFromParent(); new UnreachableInst(BB->getContext(), BB); assert(succ_empty(BB) && "The successor list of BB isn't empty before " "applying corresponding DTU updates."); - ++I; - } else { - I = F.getBasicBlockList().erase(I); } } @@ -2281,7 +2271,11 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, } if (!Deleted) return false; + } else { + for (auto *BB : DeadBlockSet) + BB->eraseFromParent(); } + return true; } diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index 3c288bab3779ff..44859eafb9c192 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -556,7 +556,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, if (isa(ValInfo)) { IRBuilder<> B(getBranchTerminator(ValInfo)); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); - if (empty(IF->users())) + if (IF->users().empty()) CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++)); @@ -568,7 +568,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, "Should not have gotten here without it being an assume"); IRBuilder<> B(PAssume->AssumeInst); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); - if (empty(IF->users())) + if (IF->users().empty()) CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op); PredicateMap.insert({PIC, ValInfo}); diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 7352ce83adb43d..ccdc24d48c761d 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3,6 +3,8 @@ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// Notified per clause 4(b) of the license. // //===----------------------------------------------------------------------===// // @@ -94,12 +96,6 @@ static cl::opt PHINodeFoldingThreshold( cl::desc( "Control the amount of phi node folding to perform (default = 2)")); -static cl::opt TwoEntryPHINodeFoldingThreshold( - "two-entry-phi-node-folding-threshold", cl::Hidden, cl::init(4), - cl::desc("Control the maximal total instruction cost that we are willing " - "to speculatively execute to fold a 2-entry PHI node into a " - "select (default = 4)")); - static cl::opt DupRet( "simplifycfg-dup-ret", cl::Hidden, cl::init(false), cl::desc("Duplicate return instructions into unconditional branches")); @@ -338,7 +334,7 @@ static unsigned ComputeSpeculationCost(const User *I, /// CostRemaining, false is returned and CostRemaining is undefined. static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl &AggressiveInsts, - int &BudgetRemaining, + unsigned &CostRemaining, const TargetTransformInfo &TTI, unsigned Depth = 0) { // It is possible to hit a zero-cost cycle (phi/gep instructions for example), @@ -381,7 +377,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, if (!isSafeToSpeculativelyExecute(I)) return false; - BudgetRemaining -= ComputeSpeculationCost(I, TTI); + unsigned Cost = ComputeSpeculationCost(I, TTI); // Allow exactly one instruction to be speculated regardless of its cost // (as long as it is safe to do so). @@ -389,14 +385,17 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // or other expensive operation. The speculation of an expensive instruction // is expected to be undone in CodeGenPrepare if the speculation has not // enabled further IR optimizations. - if (BudgetRemaining < 0 && + if (Cost > CostRemaining && (!SpeculateOneExpensiveInst || !AggressiveInsts.empty() || Depth > 0)) return false; + // Avoid unsigned wrap. + CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost; + // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, BudgetRemaining, TTI, + if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI, Depth + 1)) return false; // Okay, it's safe to do this! Remember this instruction. @@ -2324,8 +2323,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // instructions. While we are at it, keep track of the instructions // that need to be moved to the dominating block. 
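+  // Each of the two incoming values is checked against its own speculation
+  // budget (MaxCostVal0 and MaxCostVal1 below).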
SmallPtrSet AggressiveInsts; - int BudgetRemaining = - TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; + unsigned MaxCostVal0 = PHINodeFoldingThreshold, + MaxCostVal1 = PHINodeFoldingThreshold; + MaxCostVal0 *= TargetTransformInfo::TCC_Basic; + MaxCostVal1 *= TargetTransformInfo::TCC_Basic; for (BasicBlock::iterator II = BB->begin(); isa(II);) { PHINode *PN = cast(II++); @@ -2336,9 +2337,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } if (!DominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts, - BudgetRemaining, TTI) || + MaxCostVal0, TTI) || !DominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts, - BudgetRemaining, TTI)) + MaxCostVal1, TTI)) return false; } @@ -3087,15 +3088,15 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, // store that doesn't execute. if (MinAlignment != 0) { // Choose the minimum of all non-zero alignments. - SI->setAlignment(MinAlignment); + SI->setAlignment(Align(MinAlignment)); } else if (MaxAlignment != 0) { // Choose the minimal alignment between the non-zero alignment and the ABI // default alignment for the type of the stored value. - SI->setAlignment(std::min(MaxAlignment, TypeAlignment)); + SI->setAlignment(Align(std::min(MaxAlignment, TypeAlignment))); } else { // If both alignments are zero, use ABI default alignment for the type of // the stored value. - SI->setAlignment(TypeAlignment); + SI->setAlignment(Align(TypeAlignment)); } QStore->eraseFromParent(); @@ -5314,7 +5315,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Figure out the corresponding result for each case value and phi node in the // common destination, as well as the min and max case values. - assert(!empty(SI->cases())); + assert(!SI->cases().empty()); SwitchInst::CaseIt CI = SI->case_begin(); ConstantInt *MinCaseVal = CI->getCaseValue(); ConstantInt *MaxCaseVal = CI->getCaseValue(); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index adfe73fed3086f..e4f28fc2b30cb2 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -2796,6 +2796,12 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) { + // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) + return B.CreateMemMove(CI->getArgOperand(1), 1, CI->getArgOperand(0), 1, + CI->getArgOperand(2)); +} + bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { LibFunc Func; SmallString<20> FloatFuncName = FuncName; @@ -2874,6 +2880,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeRealloc(CI, Builder); case LibFunc_wcslen: return optimizeWcslen(CI, Builder); + case LibFunc_bcopy: + return optimizeBCopy(CI, Builder); default: break; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 196dbe12b876d9..99428c6c5dee38 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4024,7 +4024,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - ST->setAlignment(Alignment); + ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), 
diff --git a/llvm/projects/CMakeLists.txt b/llvm/projects/CMakeLists.txt index 47ceb9ea2f339e..7a948bdc8506b4 100644 --- a/llvm/projects/CMakeLists.txt +++ b/llvm/projects/CMakeLists.txt @@ -31,6 +31,7 @@ if(${LLVM_BUILD_RUNTIME}) # dependent projects can see the target names of their dependencies. add_llvm_external_project(libunwind) add_llvm_external_project(pstl) + add_llvm_external_project(libc) add_llvm_external_project(libcxxabi) add_llvm_external_project(libcxx) endif() diff --git a/llvm/test/Analysis/MemorySSA/pr43427.ll b/llvm/test/Analysis/MemorySSA/pr43427.ll new file mode 100644 index 00000000000000..f7088782217177 --- /dev/null +++ b/llvm/test/Analysis/MemorySSA/pr43427.ll @@ -0,0 +1,42 @@ +; RUN: opt -disable-output -licm -print-memoryssa -enable-mssa-loop-dependency=true < %s 2>&1 | FileCheck %s + +; CHECK-LABEL: @f() +; CHECK: 8 = MemoryPhi( +; CHECK: 7 = MemoryPhi( +; CHECK: 9 = MemoryPhi( +define void @f() { +entry: + %e = alloca i16, align 1 + br label %lbl1 + +lbl1: ; preds = %if.else, %cleanup, %entry + store i16 undef, i16* %e, align 1 + call void @g() + br i1 undef, label %for.end, label %if.else + +for.end: ; preds = %lbl1 + br i1 undef, label %lbl3, label %lbl2 + +lbl2: ; preds = %lbl3, %for.end + br label %lbl3 + +lbl3: ; preds = %lbl2, %for.end + br i1 undef, label %lbl2, label %cleanup + +cleanup: ; preds = %lbl3 + %cleanup.dest = load i32, i32* undef, align 1 + %switch = icmp ult i32 %cleanup.dest, 1 + br i1 %switch, label %cleanup.cont, label %lbl1 + +cleanup.cont: ; preds = %cleanup + call void @llvm.lifetime.end.p0i8(i64 1, i8* null) + ret void + +if.else: ; preds = %lbl1 + br label %lbl1 +} + +declare void @g() + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) diff --git a/llvm/test/Analysis/MemorySSA/pr43540.ll b/llvm/test/Analysis/MemorySSA/pr43540.ll new file mode 100644 index 00000000000000..325e6bc0ae8f74 --- /dev/null +++ b/llvm/test/Analysis/MemorySSA/pr43540.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -licm -enable-mssa-loop-dependency=true %s | FileCheck %s +@v_1 = global i8 0, align 1 +@v_2 = global i8 0, align 1 + +; CHECK-LABEL: @foo() +; CHECK: for.cond: +; CHECK-NOT: store +; CHECK: for.body: +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +; CHECK: store +define void @foo() { +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %0 = phi i16 [ %inc, %for.body ], [ 0, %entry ] + %cmp = icmp slt i16 %0, 1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + call void @llvm.memcpy.p0i8.p0i8.i64(i8* @v_1, i8 * @v_2, i64 1, i1 false) + store i8 1, i8 * @v_2, align 1 + %inc = add nsw i16 %0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8 * noalias nocapture readonly, i64, i1 immarg) #2 + +attributes #2 = { argmemonly nounwind willreturn } + diff --git a/llvm/test/Bindings/llvm-c/debug_info.ll b/llvm/test/Bindings/llvm-c/debug_info.ll index 6cddd1ac29a0ff..af682fdf19420e 100644 --- a/llvm/test/Bindings/llvm-c/debug_info.ll +++ b/llvm/test/Bindings/llvm-c/debug_info.ll @@ -3,13 +3,13 @@ ; CHECK: ; ModuleID = 'debuginfo.c' ; CHECK-NEXT: source_filename = "debuginfo.c" -; CHECK: define i64 @foo(i64 %0, i64 %1, <10 x i64> %2) !dbg !20 { +; CHECK: define i64 @foo(i64 %0, i64 %1, <10 x i64> %2) !dbg !31 { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.dbg.declare(metadata i64 0, metadata !27, metadata 
!DIExpression()), !dbg !32 -; CHECK-NEXT: call void @llvm.dbg.declare(metadata i64 0, metadata !28, metadata !DIExpression()), !dbg !32 -; CHECK-NEXT: call void @llvm.dbg.declare(metadata i64 0, metadata !29, metadata !DIExpression()), !dbg !32 +; CHECK-NEXT: call void @llvm.dbg.declare(metadata i64 0, metadata !38, metadata !DIExpression()), !dbg !43 +; CHECK-NEXT: call void @llvm.dbg.declare(metadata i64 0, metadata !39, metadata !DIExpression()), !dbg !43 +; CHECK-NEXT: call void @llvm.dbg.declare(metadata i64 0, metadata !40, metadata !DIExpression()), !dbg !43 ; CHECK: vars: ; No predecessors! -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 0, metadata !30, metadata !DIExpression(DW_OP_constu, 0, DW_OP_stack_value)), !dbg !33 +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 0, metadata !41, metadata !DIExpression(DW_OP_constu, 0, DW_OP_stack_value)), !dbg !44 ; CHECK-NEXT: } ; CHECK: ; Function Attrs: nounwind readnone speculatable @@ -21,39 +21,51 @@ ; CHECK: attributes #0 = { nounwind readnone speculatable willreturn } ; CHECK: !llvm.dbg.cu = !{!0} -; CHECK-NEXT: !FooType = !{!16} +; CHECK-NEXT: !FooType = !{!28} +; CHECK-NEXT: !EnumTest = !{!3} -; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "llvm-c-test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3, imports: !12, splitDebugInlining: false) +; CHECK: !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "llvm-c-test", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !11, imports: !19, macros: !23, splitDebugInlining: false) ; CHECK-NEXT: !1 = !DIFile(filename: "debuginfo.c", directory: ".") -; CHECK-NEXT: !2 = !{} -; CHECK-NEXT: !3 = !{!4, !8} -; CHECK-NEXT: !4 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression(DW_OP_constu, 0, DW_OP_stack_value)) -; CHECK-NEXT: !5 = distinct !DIGlobalVariable(name: "globalClass", scope: !6, file: !1, line: 1, type: !7, isLocal: true, isDefinition: true) -; CHECK-NEXT: !6 = !DIModule(scope: null, name: "llvm-c-test", includePath: "/test/include/llvm-c-test.h") -; CHECK-NEXT: !7 = !DICompositeType(tag: DW_TAG_structure_type, name: "TestClass", scope: !1, file: !1, line: 42, size: 64, flags: DIFlagObjcClassComplete, elements: !2) -; CHECK-NEXT: !8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression(DW_OP_constu, 0, DW_OP_stack_value)) -; CHECK-NEXT: !9 = distinct !DIGlobalVariable(name: "global", scope: !6, file: !1, line: 1, type: !10, isLocal: true, isDefinition: true) -; CHECK-NEXT: !10 = !DIDerivedType(tag: DW_TAG_typedef, name: "int64_t", scope: !1, file: !1, line: 42, baseType: !11) -; CHECK-NEXT: !11 = !DIBasicType(name: "Int64", size: 64) -; CHECK-NEXT: !12 = !{!13, !15} -; CHECK-NEXT: !13 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !6, entity: !14, file: !1, line: 42) -; CHECK-NEXT: !14 = !DIModule(scope: null, name: "llvm-c-test-import", includePath: "/test/include/llvm-c-test-import.h") -; CHECK-NEXT: !15 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !6, entity: !13, file: !1, line: 42) -; CHECK-NEXT: !16 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 192, dwarfAddressSpace: 0) -; CHECK-NEXT: !17 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !18, file: !1, size: 192, elements: !19, runtimeLang: DW_LANG_C89, identifier: "MyStruct") -; CHECK-NEXT: !18 = !DINamespace(name: "NameSpace", scope: !6) -; CHECK-NEXT: !19 = !{!11, !11, !11} -; CHECK-NEXT: !20 = distinct 
!DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !21, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !26) -; CHECK-NEXT: !21 = !DISubroutineType(types: !22) -; CHECK-NEXT: !22 = !{!11, !11, !23} -; CHECK-NEXT: !23 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, size: 640, flags: DIFlagVector, elements: !24) -; CHECK-NEXT: !24 = !{!25} -; CHECK-NEXT: !25 = !DISubrange(count: 10) -; CHECK-NEXT: !26 = !{!27, !28, !29, !30} -; CHECK-NEXT: !27 = !DILocalVariable(name: "a", arg: 1, scope: !20, file: !1, line: 42, type: !11) -; CHECK-NEXT: !28 = !DILocalVariable(name: "b", arg: 2, scope: !20, file: !1, line: 42, type: !11) -; CHECK-NEXT: !29 = !DILocalVariable(name: "c", arg: 3, scope: !20, file: !1, line: 42, type: !23) -; CHECK-NEXT: !30 = !DILocalVariable(name: "d", scope: !31, file: !1, line: 43, type: !11) -; CHECK-NEXT: !31 = distinct !DILexicalBlock(scope: !20, file: !1, line: 42) -; CHECK-NEXT: !32 = !DILocation(line: 42, scope: !20) -; CHECK-NEXT: !33 = !DILocation(line: 43, scope: !20) +; CHECK-NEXT: !2 = !{!3} +; CHECK-NEXT: !3 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "EnumTest", scope: !4, file: !1, baseType: !6, size: 64, elements: !7) +; CHECK-NEXT: !4 = !DINamespace(name: "NameSpace", scope: !5) +; CHECK-NEXT: !5 = !DIModule(scope: null, name: "llvm-c-test", includePath: "/test/include/llvm-c-test.h") +; CHECK-NEXT: !6 = !DIBasicType(name: "Int64", size: 64) +; CHECK-NEXT: !7 = !{!8, !9, !10} +; CHECK-NEXT: !8 = !DIEnumerator(name: "Test_A", value: 0, isUnsigned: true) +; CHECK-NEXT: !9 = !DIEnumerator(name: "Test_B", value: 1, isUnsigned: true) +; CHECK-NEXT: !10 = !DIEnumerator(name: "Test_B", value: 2, isUnsigned: true) +; CHECK-NEXT: !11 = !{!12, !16} +; CHECK-NEXT: !12 = !DIGlobalVariableExpression(var: !13, expr: !DIExpression(DW_OP_constu, 0, DW_OP_stack_value)) +; CHECK-NEXT: !13 = distinct !DIGlobalVariable(name: "globalClass", scope: !5, file: !1, line: 1, type: !14, isLocal: true, isDefinition: true) +; CHECK-NEXT: !14 = !DICompositeType(tag: DW_TAG_structure_type, name: "TestClass", scope: !1, file: !1, line: 42, size: 64, flags: DIFlagObjcClassComplete, elements: !15) +; CHECK-NEXT: !15 = !{} +; CHECK-NEXT: !16 = !DIGlobalVariableExpression(var: !17, expr: !DIExpression(DW_OP_constu, 0, DW_OP_stack_value)) +; CHECK-NEXT: !17 = distinct !DIGlobalVariable(name: "global", scope: !5, file: !1, line: 1, type: !18, isLocal: true, isDefinition: true) +; CHECK-NEXT: !18 = !DIDerivedType(tag: DW_TAG_typedef, name: "int64_t", scope: !1, file: !1, line: 42, baseType: !6) +; CHECK-NEXT: !19 = !{!20, !22} +; CHECK-NEXT: !20 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !21, file: !1, line: 42) +; CHECK-NEXT: !21 = !DIModule(scope: null, name: "llvm-c-test-import", includePath: "/test/include/llvm-c-test-import.h") +; CHECK-NEXT: !22 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !5, entity: !20, file: !1, line: 42) +; CHECK-NEXT: !23 = !{!24} +; CHECK-NEXT: !24 = !DIMacroFile(file: !1, nodes: !25) +; CHECK-NEXT: !25 = !{!26, !27} +; CHECK-NEXT: !26 = !DIMacro(type: DW_MACINFO_define, name: "SIMPLE_DEFINE") +; CHECK-NEXT: !27 = !DIMacro(type: DW_MACINFO_define, name: "VALUE_DEFINE", value: "1") +; CHECK-NEXT: !28 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !29, size: 192, dwarfAddressSpace: 0) +; CHECK-NEXT: !29 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct", scope: !4, file: !1, size: 192, elements: !30, runtimeLang: 
DW_LANG_C89, identifier: "MyStruct") +; CHECK-NEXT: !30 = !{!6, !6, !6} +; CHECK-NEXT: !31 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !1, file: !1, line: 42, type: !32, scopeLine: 42, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !37) +; CHECK-NEXT: !32 = !DISubroutineType(types: !33) +; CHECK-NEXT: !33 = !{!6, !6, !34} +; CHECK-NEXT: !34 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 640, flags: DIFlagVector, elements: !35) +; CHECK-NEXT: !35 = !{!36} +; CHECK-NEXT: !36 = !DISubrange(count: 10) +; CHECK-NEXT: !37 = !{!38, !39, !40, !41} +; CHECK-NEXT: !38 = !DILocalVariable(name: "a", arg: 1, scope: !31, file: !1, line: 42, type: !6) +; CHECK-NEXT: !39 = !DILocalVariable(name: "b", arg: 2, scope: !31, file: !1, line: 42, type: !6) +; CHECK-NEXT: !40 = !DILocalVariable(name: "c", arg: 3, scope: !31, file: !1, line: 42, type: !34) +; CHECK-NEXT: !41 = !DILocalVariable(name: "d", scope: !42, file: !1, line: 43, type: !6) +; CHECK-NEXT: !42 = distinct !DILexicalBlock(scope: !31, file: !1, line: 42) +; CHECK-NEXT: !43 = !DILocation(line: 42, scope: !31) +; CHECK-NEXT: !44 = !DILocation(line: 43, scope: !31) diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir new file mode 100644 index 00000000000000..9009a6a29bf683 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -0,0 +1,121 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# +# Test allocation and deallocation of SVE objects on the stack, +# as well as using a combination of scalable and non-scalable +# offsets to access the SVE on the stack. +# +# SVE objects are allocated below the (scalar) callee saves, +# and above spills/locals and the alignment gap, e.g. +# +# +-------------+ +# | stack arg | +# +-------------+ <- SP before call +# | Callee Saves| +# | Frame record| (if available) +# |-------------| <- FP (if available) +# | SVE area | +# +-------------+ +# |/////////////| alignment gap. +# | : | +# | Stack objs | +# | : | +# +-------------+ <- SP after call and frame-setup +# +--- | + + define void @test_allocate_sve() nounwind { entry: unreachable } + define void @test_allocate_sve_gpr_callee_saves() nounwind { entry: unreachable } + define void @test_allocate_sve_gpr_realigned() nounwind { entry: unreachable } + +... +# +----------+ +# | %fixed- | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | stack.0 | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) +# +----------+ +# | %stack.0 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_sve +# CHECK: stackSize: 16 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 + +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + RET_ReallyLR +--- +... 
+# +----------+
+# | x20, x21 | // callee saves
+# +----------+
+# | %fixed-  | // scalable objects
+# | stack.0  |
+# +----------+
+# | %stack.0 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_allocate_sve_gpr_callee_saves
+# CHECK:       stackSize: 32
+
+# CHECK:      bb.0.entry:
+# CHECK-NEXT: $sp = frame-setup STPXpre killed $x21, killed $x20, $sp, -2
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0
+# CHECK-NEXT: $x20 = IMPLICIT_DEF
+# CHECK-NEXT: $x21 = IMPLICIT_DEF
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
+# CHECK-NEXT: $sp, $x21, $x20 = frame-destroy LDPXpost $sp, 2
+# CHECK-NEXT: RET_ReallyLR
+name: test_allocate_sve_gpr_callee_saves
+fixedStack:
+  - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 }
+stack:
+  - { id: 0, stack-id: default, size: 16, alignment: 8 }
+body: |
+  bb.0.entry:
+    $x20 = IMPLICIT_DEF
+    $x21 = IMPLICIT_DEF
+    RET_ReallyLR
+---
+...
+# +----------+
+# | lr, fp   | // frame record
+# +----------+ <- FP
+# | %fixed-  | // scalable objects
+# | stack.0  |
+# +----------+
+# |//////////| // alignment gap
+# | %stack.0 | // not scalable
+# +----------+ <- SP
+# CHECK-LABEL: name: test_allocate_sve_gpr_realigned
+# CHECK:       stackSize: 32
+
+# CHECK:      bb.0.entry:
+# CHECK-NEXT: $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0
+# CHECK-NEXT: $sp = ANDXri killed $[[TMP]]
+# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
+# CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
+# CHECK-NEXT: RET_ReallyLR
+name: test_allocate_sve_gpr_realigned
+fixedStack:
+  - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 }
+stack:
+  - { id: 0, stack-id: default, size: 16, alignment: 32 }
+body: |
+  bb.0.entry:
+    RET_ReallyLR
+---
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-noreturn.mir b/llvm/test/CodeGen/AArch64/machine-outliner-noreturn.mir
new file mode 100644
index 00000000000000..29166f9f12d82b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-noreturn.mir
@@ -0,0 +1,56 @@
+# RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  define void @foo() #0 { ret void }
+  define void @bar(i32 %a) #0 { ret void }
+  define void @baz(i32 %a) #0 { ret void }
+  attributes #0 = { noredzone noreturn }
+...
+---
+
+# Temporarily disable outlining from noreturn functions. To do this, we need
+# to verify that every function we want to outline from is noreturn.
+
+# CHECK-NOT: OUTLINED_FUNCTION
+
+name: foo
+alignment: 4
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  bb.0:
+    $w3 = ORRWri $wzr, 1
+    $w4 = ORRWri $wzr, 1
+    BRK 1
+...
+---
+name: bar
+alignment: 4
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  bb.0:
+    $w3 = ORRWri $wzr, 1
+    $w4 = ORRWri $wzr, 1
+    BRK 1
+...
+---
+name: baz
+alignment: 4
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 1
+  maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+  bb.0:
+    $w3 = ORRWri $wzr, 1
+    $w4 = ORRWri $wzr, 1
+    BRK 1
+...
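The srem-lkk.ll and srem-vector-lkk.ll tests that follow check that srem by a constant such as 95 is lowered to a multiply-high sequence instead of an sdiv. As a rough guide to what the fold_srem_positive_odd CHECK lines encode, here is a minimal C sketch of the same computation; the multiplier -1401515643 (0xAC769185, i.e. movk #44150, lsl #16 over #37253) and the shift amount of 6 are read off those CHECK lines, the helper name is invented, and arithmetic right shift of negative signed values is assumed:

#include <assert.h>
#include <stdint.h>

/* x % 95 without a divide: take the high 32 bits of a widening multiply by a
   magic constant, add x back because the signed magic for 95 is negative,
   shift, round toward zero, then subtract q*95 (the msub). */
static int32_t srem95(int32_t x) {
  int64_t prod = (int64_t)x * -1401515643;                /* smull               */
  int32_t hi = (int32_t)(prod >> 32) + x;                 /* lsr #32; add        */
  int32_t q = (hi >> 6) + (int32_t)((uint32_t)hi >> 31);  /* asr #6; add lsr #31 */
  return x - q * 95;                                      /* msub                */
}

int main(void) {
  for (int32_t x = -100000; x <= 100000; ++x)
    assert(srem95(x) == x % 95);
  return 0;
}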
diff --git a/llvm/test/CodeGen/AArch64/srem-lkk.ll b/llvm/test/CodeGen/AArch64/srem-lkk.ll new file mode 100644 index 00000000000000..321791e9228fce --- /dev/null +++ b/llvm/test/CodeGen/AArch64/srem-lkk.ll @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @fold_srem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_srem_positive_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #6 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positive_even(i32 %x) { +; CHECK-LABEL: fold_srem_positive_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #36849 +; CHECK-NEXT: movk w8, #15827, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #40 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #1060 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; CHECK-LABEL: fold_srem_negative_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #65445 +; CHECK-NEXT: movk w8, #42330, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #40 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #-723 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; CHECK-LABEL: fold_srem_negative_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #62439 +; CHECK-NEXT: movk w8, #64805, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #40 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #-22981 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. +define i32 @combine_srem_sdiv(i32 %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #6 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w9, w8, w9, w0 +; CHECK-NEXT: add w0, w9, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, #63 // =63 +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0xffffffc0 +; CHECK-NEXT: sub w0, w0, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. 
+define i32 @dont_fold_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_fold_srem_i32_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2147483647 +; CHECK-NEXT: add w8, w0, w8 +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w8, w0, lt +; CHECK-NEXT: and w8, w8, #0x80000000 +; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: ret + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #58849 +; CHECK-NEXT: movk x8, #48148, lsl #16 +; CHECK-NEXT: movk x8, #33436, lsl #32 +; CHECK-NEXT: movk x8, #21399, lsl #48 +; CHECK-NEXT: smulh x8, x0, x8 +; CHECK-NEXT: asr x9, x8, #5 +; CHECK-NEXT: add x8, x9, x8, lsr #63 +; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: msub x0, x8, x9, x0 +; CHECK-NEXT: ret + %1 = srem i64 %x, 98 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll new file mode 100644 index 00000000000000..5597e16576ccc2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -0,0 +1,324 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; CHECK-LABEL: fold_srem_vec_1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #63421 +; CHECK-NEXT: mov w12, #33437 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: movk w9, #31710, lsl #16 +; CHECK-NEXT: smov w11, v0.h[2] +; CHECK-NEXT: movk w12, #21399, lsl #16 +; CHECK-NEXT: smull x12, w11, w12 +; CHECK-NEXT: smull x9, w8, w9 +; CHECK-NEXT: lsr x13, x12, #63 +; CHECK-NEXT: asr x12, x12, #37 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w12, w12, w13 +; CHECK-NEXT: mov w13, #98 +; CHECK-NEXT: sub w9, w9, w8 +; CHECK-NEXT: msub w11, w12, w13, w11 +; CHECK-NEXT: asr w13, w9, #6 +; CHECK-NEXT: add w9, w13, w9, lsr #31 +; CHECK-NEXT: mov w13, #37253 +; CHECK-NEXT: mov w10, #-124 +; CHECK-NEXT: smov w12, v0.h[0] +; CHECK-NEXT: movk w13, #44150, lsl #16 +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: smull x10, w12, w13 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: add w10, w10, w12 +; CHECK-NEXT: asr w13, w10, #6 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: add w10, w13, w10, lsr #31 +; CHECK-NEXT: msub w9, w10, w9, w12 +; CHECK-NEXT: mov w10, #63249 +; CHECK-NEXT: smov w13, v0.h[3] +; CHECK-NEXT: movk w10, #48808, lsl #16 +; CHECK-NEXT: smull x10, w13, w10 +; CHECK-NEXT: lsr x12, x10, #63 +; CHECK-NEXT: asr x10, x10, #40 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: add w10, w10, w12 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #-1003 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: msub w8, w10, w8, w13 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; CHECK-LABEL: fold_srem_vec_2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #37253 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: movk w9, #44150, lsl #16 +; CHECK-NEXT: smov w10, v0.h[0] +; CHECK-NEXT: smull x13, w8, w9 +; CHECK-NEXT: smov w11, v0.h[2] +; CHECK-NEXT: smull x14, w10, w9 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: smull x15, w11, w9 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: add w13, w13, w8 +; CHECK-NEXT: smull x9, w12, w9 +; CHECK-NEXT: 
lsr x15, x15, #32 +; CHECK-NEXT: add w14, w14, w10 +; CHECK-NEXT: asr w16, w13, #6 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w15, w15, w11 +; CHECK-NEXT: add w13, w16, w13, lsr #31 +; CHECK-NEXT: asr w16, w14, #6 +; CHECK-NEXT: add w9, w9, w12 +; CHECK-NEXT: add w14, w16, w14, lsr #31 +; CHECK-NEXT: asr w16, w15, #6 +; CHECK-NEXT: add w15, w16, w15, lsr #31 +; CHECK-NEXT: asr w16, w9, #6 +; CHECK-NEXT: add w9, w16, w9, lsr #31 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: msub w10, w14, w16, w10 +; CHECK-NEXT: msub w8, w13, w16, w8 +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: msub w11, w15, w16, w11 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: msub w8, w9, w16, w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. +define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: movk w8, #44150, lsl #16 +; CHECK-NEXT: smov w9, v0.h[1] +; CHECK-NEXT: smov w10, v0.h[0] +; CHECK-NEXT: smull x13, w9, w8 +; CHECK-NEXT: smov w11, v0.h[2] +; CHECK-NEXT: smull x14, w10, w8 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: smull x15, w11, w8 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: add w13, w13, w9 +; CHECK-NEXT: smull x8, w12, w8 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w14, w14, w10 +; CHECK-NEXT: asr w16, w13, #6 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w15, w15, w11 +; CHECK-NEXT: add w13, w16, w13, lsr #31 +; CHECK-NEXT: asr w16, w14, #6 +; CHECK-NEXT: add w8, w8, w12 +; CHECK-NEXT: add w14, w16, w14, lsr #31 +; CHECK-NEXT: asr w16, w15, #6 +; CHECK-NEXT: add w15, w16, w15, lsr #31 +; CHECK-NEXT: asr w16, w8, #6 +; CHECK-NEXT: add w8, w16, w8, lsr #31 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: msub w10, w14, w16, w10 +; CHECK-NEXT: msub w9, w13, w16, w9 +; CHECK-NEXT: fmov s0, w14 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: msub w11, w15, w16, w11 +; CHECK-NEXT: mov v0.h[1], w13 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: msub w12, w8, w16, w12 +; CHECK-NEXT: mov v0.h[2], w15 +; CHECK-NEXT: mov v1.h[2], w11 +; CHECK-NEXT: mov v1.h[3], w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: add w12, w8, #31 // =31 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: mov w11, #37253 +; CHECK-NEXT: csel w12, w12, w8, lt +; CHECK-NEXT: smov w9, v0.h[0] +; CHECK-NEXT: smov w10, v0.h[3] +; CHECK-NEXT: movk w11, #44150, lsl #16 +; CHECK-NEXT: and w12, w12, #0xffffffe0 +; CHECK-NEXT: sub w8, w8, w12 +; CHECK-NEXT: add w12, w9, #63 // =63 +; CHECK-NEXT: smull x11, w10, w11 +; CHECK-NEXT: cmp w9, #0 // =0 +; CHECK-NEXT: lsr x11, x11, #32 +; CHECK-NEXT: csel w12, w12, w9, lt +; CHECK-NEXT: add w11, w11, w10 +; CHECK-NEXT: and w12, w12, #0xffffffc0 +; CHECK-NEXT: sub w9, w9, w12 +; CHECK-NEXT: asr w12, w11, #6 +; CHECK-NEXT: add w11, w12, w11, lsr #31 +; CHECK-NEXT: smov w12, v0.h[2] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: add w9, w12, #7 // =7 +; CHECK-NEXT: cmp w12, #0 // =0 +; CHECK-NEXT: csel w9, w9, w12, lt +; CHECK-NEXT: and w9, w9, #0xfffffff8 +; CHECK-NEXT: sub w9, w12, w9 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #95 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: msub w8, w11, w8, w10 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #17097 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w8, v0.h[2] +; CHECK-NEXT: movk w9, #45590, lsl #16 +; CHECK-NEXT: smull x9, w8, w9 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w9, w9, w8 +; CHECK-NEXT: asr w12, w9, #4 +; CHECK-NEXT: add w9, w12, w9, lsr #31 +; CHECK-NEXT: mov w12, #30865 +; CHECK-NEXT: mov w10, #23 +; CHECK-NEXT: smov w11, v0.h[1] +; CHECK-NEXT: movk w12, #51306, lsl #16 +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: smull x10, w11, w12 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: asr w12, w10, #9 +; CHECK-NEXT: mov w9, #654 +; CHECK-NEXT: add w10, w12, w10, lsr #31 +; CHECK-NEXT: msub w9, w10, w9, w11 +; CHECK-NEXT: mov w10, #47143 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: movk w10, #24749, lsl #16 +; CHECK-NEXT: smull x10, w12, w10 +; CHECK-NEXT: lsr x11, x10, #63 +; CHECK-NEXT: asr x10, x10, #43 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: mov v0.h[1], w9 +; CHECK-NEXT: mov w9, #5423 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: msub w8, w10, w9, w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
+define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_i16_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w10, #17097 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w9, v0.h[2] +; CHECK-NEXT: movk w10, #45590, lsl #16 +; CHECK-NEXT: smull x10, w9, w10 +; CHECK-NEXT: lsr x10, x10, #32 +; CHECK-NEXT: add w10, w10, w9 +; CHECK-NEXT: asr w12, w10, #4 +; CHECK-NEXT: mov w11, #23 +; CHECK-NEXT: add w10, w12, w10, lsr #31 +; CHECK-NEXT: msub w9, w10, w11, w9 +; CHECK-NEXT: mov w10, #47143 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: movk w10, #24749, lsl #16 +; CHECK-NEXT: smull x10, w12, w10 +; CHECK-NEXT: lsr x11, x10, #63 +; CHECK-NEXT: asr x10, x10, #43 +; CHECK-NEXT: smov w8, v0.h[1] +; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: mov w11, #32767 +; CHECK-NEXT: add w11, w8, w11 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: csel w11, w11, w8, lt +; CHECK-NEXT: and w11, w11, #0xffff8000 +; CHECK-NEXT: sub w8, w8, w11 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #5423 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: msub w8, w10, w8, w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. +define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #6055 +; CHECK-NEXT: movk x9, #58853, lsl #16 +; CHECK-NEXT: movk x9, #47142, lsl #32 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: movk x9, #24749, lsl #48 +; CHECK-NEXT: smulh x9, x8, x9 +; CHECK-NEXT: asr x12, x9, #11 +; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: add x9, x12, x9, lsr #63 +; CHECK-NEXT: msub x8, x9, x10, x8 +; CHECK-NEXT: mov x9, #21445 +; CHECK-NEXT: movk x9, #1603, lsl #16 +; CHECK-NEXT: movk x9, #15432, lsl #32 +; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: movk x9, #25653, lsl #48 +; CHECK-NEXT: smulh x9, x12, x9 +; CHECK-NEXT: asr x10, x9, #8 +; CHECK-NEXT: add x9, x10, x9, lsr #63 +; CHECK-NEXT: mov w10, #654 +; CHECK-NEXT: msub x9, x9, x10, x12 +; CHECK-NEXT: mov x10, #8549 +; CHECK-NEXT: movk x10, #22795, lsl #16 +; CHECK-NEXT: movk x10, #17096, lsl #32 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: movk x10, #45590, lsl #48 +; CHECK-NEXT: smulh x10, x11, x10 +; CHECK-NEXT: add x10, x10, x11 +; CHECK-NEXT: asr x12, x10, #4 +; CHECK-NEXT: add x10, x12, x10, lsr #63 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: msub x10, x10, x12, x11 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: fmov d1, x10 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.d[1], x9 +; CHECK-NEXT: ret + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} diff --git a/llvm/test/CodeGen/AArch64/sve-fp.ll b/llvm/test/CodeGen/AArch64/sve-fp.ll new file mode 100644 index 00000000000000..08913858886f71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fp.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define @fadd_h( %a, %b) { +; CHECK-LABEL: fadd_h: +; CHECK: fadd z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = fadd %a, %b + ret %res +} + +define @fadd_s( %a, %b) { +; CHECK-LABEL: fadd_s: +; CHECK: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = fadd %a, %b + ret %res +} + +define @fadd_d( %a, %b) { +; CHECK-LABEL: fadd_d: +; CHECK: fadd z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %res = fadd %a, %b + ret %res +} diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-bits.ll 
b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-bits.ll new file mode 100644 index 00000000000000..2350353a27424f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-bits.ll @@ -0,0 +1,83 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; CNT +; + +define @cnt_i8( %a, %pg, %b) { +; CHECK-LABEL: cnt_i8: +; CHECK: cnt z0.b, p0/m, z1.b +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.cnt.nxv16i8( %a, + %pg, + %b) + ret %out +} + +define @cnt_i16( %a, %pg, %b) { +; CHECK-LABEL: cnt_i16: +; CHECK: cnt z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.cnt.nxv8i16( %a, + %pg, + %b) + ret %out +} + +define @cnt_i32( %a, %pg, %b) { +; CHECK-LABEL: cnt_i32: +; CHECK: cnt z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.cnt.nxv4i32( %a, + %pg, + %b) + ret %out +} + +define @cnt_i64( %a, %pg, %b) { +; CHECK-LABEL: cnt_i64: +; CHECK: cnt z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.cnt.nxv2i64( %a, + %pg, + %b) + ret %out +} + +define @cnt_f16( %a, %pg, %b) { +; CHECK-LABEL: cnt_f16: +; CHECK: cnt z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.cnt.nxv8f16( %a, + %pg, + %b) + ret %out +} + +define @cnt_f32( %a, %pg, %b) { +; CHECK-LABEL: cnt_f32: +; CHECK: cnt z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.cnt.nxv4f32( %a, + %pg, + %b) + ret %out +} + +define @cnt_f64( %a, %pg, %b) { +; CHECK-LABEL: cnt_f64: +; CHECK: cnt z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.cnt.nxv2f64( %a, + %pg, + %b) + ret %out +} + +declare @llvm.aarch64.sve.cnt.nxv16i8(, , ) +declare @llvm.aarch64.sve.cnt.nxv8i16(, , ) +declare @llvm.aarch64.sve.cnt.nxv4i32(, , ) +declare @llvm.aarch64.sve.cnt.nxv2i64(, , ) +declare @llvm.aarch64.sve.cnt.nxv8f16(, , ) +declare @llvm.aarch64.sve.cnt.nxv4f32(, , ) +declare @llvm.aarch64.sve.cnt.nxv2f64(, , ) diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll new file mode 100644 index 00000000000000..3d7f309ddaf315 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @fold_urem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_urem_positive_odd: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w9, w0, w8 +; CHECK-NEXT: add w8, w8, w9, lsr #1 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positive_even(i32 %x) { +; CHECK-LABEL: fold_urem_positive_even: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16323 +; CHECK-NEXT: movk w8, #63310, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #42 +; CHECK-NEXT: mov w9, #1060 +; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. 
+define i32 @combine_urem_udiv(i32 %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w9, w0, w8 +; CHECK-NEXT: add w8, w8, w9, lsr #1 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: msub w9, w8, w9, w0 +; CHECK-NEXT: add w0, w9, w8 +; CHECK-NEXT: ret + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: and w0, w0, #0x3f +; CHECK-NEXT: ret + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #58849 +; CHECK-NEXT: movk x9, #48148, lsl #16 +; CHECK-NEXT: movk x9, #33436, lsl #32 +; CHECK-NEXT: lsr x8, x0, #1 +; CHECK-NEXT: movk x9, #21399, lsl #48 +; CHECK-NEXT: umulh x8, x8, x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: msub x0, x8, x9, x0 +; CHECK-NEXT: ret + %1 = urem i64 %x, 98 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll new file mode 100644 index 00000000000000..c5951a4993fc37 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -0,0 +1,267 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; CHECK-LABEL: fold_urem_vec_1: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w11, #33437 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w10, v0.h[2] +; CHECK-NEXT: movk w11, #21399, lsl #16 +; CHECK-NEXT: umull x11, w10, w11 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov w9, #16913 +; CHECK-NEXT: mov w12, #98 +; CHECK-NEXT: lsr x11, x11, #37 +; CHECK-NEXT: movk w9, #8456, lsl #16 +; CHECK-NEXT: msub w10, w11, w12, w10 +; CHECK-NEXT: ubfx w12, w8, #2, #14 +; CHECK-NEXT: umull x9, w12, w9 +; CHECK-NEXT: mov w11, #124 +; CHECK-NEXT: lsr x9, x9, #34 +; CHECK-NEXT: msub w8, w9, w11, w8 +; CHECK-NEXT: mov w9, #8969 +; CHECK-NEXT: umov w12, v0.h[0] +; CHECK-NEXT: movk w9, #22765, lsl #16 +; CHECK-NEXT: umull x9, w12, w9 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: sub w11, w12, w9 +; CHECK-NEXT: add w9, w9, w11, lsr #1 +; CHECK-NEXT: mov w11, #95 +; CHECK-NEXT: lsr w9, w9, #6 +; CHECK-NEXT: msub w9, w9, w11, w12 +; CHECK-NEXT: umov w11, v0.h[3] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: mov w9, #2287 +; CHECK-NEXT: movk w9, #16727, lsl #16 +; CHECK-NEXT: umull x9, w11, w9 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, #1003 +; CHECK-NEXT: lsr x9, x9, #40 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: msub w8, w9, w8, w11 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x 
i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; CHECK-LABEL: fold_urem_vec_2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #8969 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: movk w9, #22765, lsl #16 +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: umull x13, w8, w9 +; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: umull x14, w10, w9 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: umull x15, w11, w9 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: sub w16, w8, w13 +; CHECK-NEXT: umull x9, w12, w9 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w13, w13, w16, lsr #1 +; CHECK-NEXT: sub w16, w10, w14 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: add w14, w14, w16, lsr #1 +; CHECK-NEXT: sub w16, w11, w15 +; CHECK-NEXT: add w15, w15, w16, lsr #1 +; CHECK-NEXT: sub w16, w12, w9 +; CHECK-NEXT: add w9, w9, w16, lsr #1 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: lsr w13, w13, #6 +; CHECK-NEXT: msub w8, w13, w16, w8 +; CHECK-NEXT: lsr w13, w14, #6 +; CHECK-NEXT: msub w10, w13, w16, w10 +; CHECK-NEXT: lsr w13, w15, #6 +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: msub w11, w13, w16, w11 +; CHECK-NEXT: lsr w9, w9, #6 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: msub w8, w9, w16, w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. +define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: umull x13, w9, w8 +; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: umull x14, w10, w8 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: umull x15, w11, w8 +; CHECK-NEXT: lsr x14, x14, #32 +; CHECK-NEXT: sub w16, w9, w13 +; CHECK-NEXT: umull x8, w12, w8 +; CHECK-NEXT: lsr x15, x15, #32 +; CHECK-NEXT: add w13, w13, w16, lsr #1 +; CHECK-NEXT: sub w16, w10, w14 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: add w14, w14, w16, lsr #1 +; CHECK-NEXT: sub w16, w11, w15 +; CHECK-NEXT: add w15, w15, w16, lsr #1 +; CHECK-NEXT: sub w16, w12, w8 +; CHECK-NEXT: add w8, w8, w16, lsr #1 +; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: lsr w14, w14, #6 +; CHECK-NEXT: lsr w13, w13, #6 +; CHECK-NEXT: msub w10, w14, w16, w10 +; CHECK-NEXT: lsr w15, w15, #6 +; CHECK-NEXT: msub w9, w13, w16, w9 +; CHECK-NEXT: fmov s0, w14 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: msub w11, w15, w16, w11 +; CHECK-NEXT: mov v0.h[1], w13 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: msub w12, w8, w16, w12 +; CHECK-NEXT: mov v0.h[2], w15 +; CHECK-NEXT: mov v1.h[2], w11 +; CHECK-NEXT: mov v1.h[3], w12 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #8969 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: movk w9, #22765, lsl #16 +; CHECK-NEXT: umull x9, w8, w9 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: sub w10, w8, w9 +; CHECK-NEXT: add w9, w9, w10, lsr #1 +; CHECK-NEXT: mov w10, #95 +; CHECK-NEXT: lsr w9, w9, #6 +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: and w9, w9, #0x3f +; CHECK-NEXT: umov w10, v0.h[1] +; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: umov w9, v0.h[2] +; CHECK-NEXT: and w10, w10, #0x1f +; CHECK-NEXT: and w9, w9, #0x7 +; CHECK-NEXT: mov v1.h[1], w10 +; CHECK-NEXT: mov v1.h[2], w9 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #17097 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: movk w9, #45590, lsl #16 +; CHECK-NEXT: umull x9, w8, w9 +; CHECK-NEXT: mov w10, #23 +; CHECK-NEXT: lsr x9, x9, #36 +; CHECK-NEXT: umov w11, v0.h[1] +; CHECK-NEXT: msub w8, w9, w10, w8 +; CHECK-NEXT: mov w9, #30865 +; CHECK-NEXT: movk w9, #51306, lsl #16 +; CHECK-NEXT: ubfx w10, w11, #1, #15 +; CHECK-NEXT: umull x9, w10, w9 +; CHECK-NEXT: mov w10, #654 +; CHECK-NEXT: lsr x9, x9, #40 +; CHECK-NEXT: msub w9, w9, w10, w11 +; CHECK-NEXT: mov w11, #47143 +; CHECK-NEXT: umov w10, v0.h[3] +; CHECK-NEXT: movk w11, #24749, lsl #16 +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: umull x11, w10, w11 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: mov w9, #5423 +; CHECK-NEXT: lsr x11, x11, #43 +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: msub w8, w11, w9, w10 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. 
+define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
+; CHECK-LABEL: dont_fold_urem_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x10, #12109
+; CHECK-NEXT:    movk x10, #52170, lsl #16
+; CHECK-NEXT:    movk x10, #28749, lsl #32
+; CHECK-NEXT:    mov x8, v1.d[1]
+; CHECK-NEXT:    movk x10, #49499, lsl #48
+; CHECK-NEXT:    umulh x10, x8, x10
+; CHECK-NEXT:    mov w11, #5423
+; CHECK-NEXT:    lsr x10, x10, #12
+; CHECK-NEXT:    msub x8, x10, x11, x8
+; CHECK-NEXT:    mov x10, #21445
+; CHECK-NEXT:    movk x10, #1603, lsl #16
+; CHECK-NEXT:    mov x12, v0.d[1]
+; CHECK-NEXT:    movk x10, #15432, lsl #32
+; CHECK-NEXT:    movk x10, #25653, lsl #48
+; CHECK-NEXT:    lsr x11, x12, #1
+; CHECK-NEXT:    umulh x10, x11, x10
+; CHECK-NEXT:    mov w11, #654
+; CHECK-NEXT:    lsr x10, x10, #7
+; CHECK-NEXT:    msub x10, x10, x11, x12
+; CHECK-NEXT:    mov x11, #17097
+; CHECK-NEXT:    movk x11, #45590, lsl #16
+; CHECK-NEXT:    movk x11, #34192, lsl #32
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    movk x11, #25644, lsl #48
+; CHECK-NEXT:    umulh x11, x9, x11
+; CHECK-NEXT:    sub x12, x9, x11
+; CHECK-NEXT:    add x11, x11, x12, lsr #1
+; CHECK-NEXT:    mov w12, #23
+; CHECK-NEXT:    lsr x11, x11, #4
+; CHECK-NEXT:    msub x9, x11, x12, x9
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    mov v1.d[1], x8
+; CHECK-NEXT:    mov v0.d[1], x10
+; CHECK-NEXT:    ret
+  %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
+  ret <4 x i64> %1
+}
diff --git a/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll b/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll
new file mode 100644
index 00000000000000..31cc59c10b9c71
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll
@@ -0,0 +1,547 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-unknown -O3 -verify-machineinstrs < %s | FileCheck %s
+
+; Test cases are generated from:
+; long long NAME(PARAM a, PARAM b) {
+;   if (LHS > RHS)
+;     return b;
+;   if (LHS < RHS)
+;     return a;
+;   return a * b;
+; }
+; Please note function name is defined as <PARAM>_<LHS>_<RHS>. Take ll_a_op_b__1
+; for example. ll is PARAM, a_op_b (i.e., a << b) is LHS, _1 (i.e., -1) is RHS.
+ +target datalayout = "e-m:e-i64:64-n32:64" + +define i64 @ll_a_op_b__2(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b__2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x8, x0, x1 +; CHECK-NEXT: cmn x8, #2 // =2 +; CHECK-NEXT: b.le .LBB0_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: // %if.end +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, -2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, -2 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_op_b__1(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b__1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x8, x0, x1 +; CHECK-NEXT: tbnz x8, #63, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_2: // %if.end +; CHECK-NEXT: cmn x8, #1 // =1 +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, -1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, -1 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_op_b_0(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x8, x0, x1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: b.le .LBB2_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_2: // %if.end +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, 0 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_op_b_1(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x8, x0, x1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: b.le .LBB3_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_2: // %if.end +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, 1 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_op_b_2(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl x8, x0, x1 +; CHECK-NEXT: cmp x8, #2 // =2 +; CHECK-NEXT: b.le .LBB4_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_2: // %if.end +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, 2 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; 
preds = %entry + ret i64 %b +} + +define i64 @ll_a__2(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a__2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmn x0, #2 // =2 +; CHECK-NEXT: b.le .LBB5_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_2: // %if.end +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i64 %a, -2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, -2 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a__1(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a__1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: tbnz x0, #63, .LBB6_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB6_2: // %if.end +; CHECK-NEXT: cmn x0, #1 // =1 +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i64 %a, -1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, -1 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_0(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp x0, #0 // =0 +; CHECK-NEXT: b.le .LBB7_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_2: // %if.end +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i64 %a, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, 0 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_1(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp x0, #1 // =1 +; CHECK-NEXT: b.le .LBB8_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_2: // %if.end +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i64 %a, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, 1 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_2(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp x0, #2 // =2 +; CHECK-NEXT: b.le .LBB9_2 +; CHECK-NEXT: // %bb.1: // %return +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_2: // %if.end +; CHECK-NEXT: csinc x8, x1, xzr, eq +; CHECK-NEXT: mul x0, x8, x0 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i64 %a, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, 2 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @i_a_op_b__2(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b__2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: cmn w8, #2 // =2 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w1, w8, gt +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: 
ret +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, -2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, -2 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_op_b__1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b__1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: cmn w8, #1 // =1 +; CHECK-NEXT: csinc w9, w1, wzr, eq +; CHECK-NEXT: mul w9, w9, w0 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: csel w8, w1, w9, ge +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, -1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, -1 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_op_b_0(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w1, w8, gt +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, 0 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_op_b_1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: cmp w8, #1 // =1 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w1, w8, gt +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, 1 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_op_b_2(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: lsl w8, w0, w1 +; CHECK-NEXT: cmp w8, #2 // =2 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w1, w8, gt +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, 2 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a__2(i32 signext %a, i32 signext 
%b) { +; CHECK-LABEL: i_a__2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmn w0, #2 // =2 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w1, w8, gt +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i32 %a, -2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, -2 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a__1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a__1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmn w0, #1 // =1 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csel w8, w1, w8, ge +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i32 %a, -1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, -1 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_0(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w0, #0 // =0 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w1, w8, gt +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, 0 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w0, #1 // =1 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w1, w8, gt +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i32 %a, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, 1 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_2(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w0, #2 // =2 +; CHECK-NEXT: csinc w8, w1, wzr, eq +; CHECK-NEXT: mul w8, w8, w0 +; CHECK-NEXT: csel w8, w1, w8, gt +; CHECK-NEXT: sxtw x0, w8 +; CHECK-NEXT: ret +entry: + %cmp = icmp sgt i32 %a, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, 2 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-extract.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-extract.mir index 4202c4d1348efa..63aabbcbb11355 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-extract.mir @@ -417,9 +417,9 @@ body: | ; CHECK-LABEL: name: extract_s16_build_vector_v2s64_v2s16_v2s16_offset32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY1]](<2 x s16>), 0 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: $vgpr0 = COPY [[COPY2]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<4 x s16>) = G_CONCAT_VECTORS %0, %1 @@ -437,9 +437,11 @@ body: | ; CHECK-LABEL: name: extract_s16_build_vector_v2s64_v2s16_v2s16_offset48 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY1]](<2 x s16>), 16 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY2]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<4 x s16>) = G_CONCAT_VECTORS %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbh-u32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbh-u32.mir new file mode 100644 index 00000000000000..026b6648af8314 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbh-u32.mir @@ -0,0 +1,68 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s + +--- + +name: ffbh_u32_s32_s_s +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: ffbh_u32_s32_s_s + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK: [[S_FLBIT_I32_B32_:%[0-9]+]]:sreg_32 = S_FLBIT_I32_B32 [[COPY]] + ; CHECK: S_ENDPGM 0, implicit [[S_FLBIT_I32_B32_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_AMDGPU_FFBH_U32 %0 + S_ENDPGM 0, implicit %1 + +... + +--- + +name: ffbh_u32_s32_v_v +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: ffbh_u32_s32_v_v + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK: [[V_FFBH_U32_e64_:%[0-9]+]]:vgpr_32 = V_FFBH_U32_e64 [[COPY]], implicit $exec + ; CHECK: S_ENDPGM 0, implicit [[V_FFBH_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_AMDGPU_FFBH_U32 %0 + S_ENDPGM 0, implicit %1 + +... 
+ +--- + +name: ffbh_u32_v_s +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: ffbh_u32_v_s + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; CHECK: [[V_FFBH_U32_e64_:%[0-9]+]]:vgpr_32 = V_FFBH_U32_e64 [[COPY]], implicit $exec + ; CHECK: S_ENDPGM 0, implicit [[V_FFBH_U32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:vgpr(s32) = G_AMDGPU_FFBH_U32 %0 + S_ENDPGM 0, implicit %1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir index 1a90e609f7bd30..e1ce7872e93559 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir @@ -10,51 +10,258 @@ # RUN: FileCheck -check-prefixes=ERR-GFX910,ERR %s < %t # ERR-NOT: remark -# ERR-GFX8: remark: :0:0: cannot select: %3:sgpr(s16) = G_ASHR %2:sgpr, %1:sgpr(s32) (in function: ashr_s16_ss) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:sgpr, %1:vgpr(s32) (in function: ashr_s16_sv) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:sgpr(s32) (in function: ashr_s16_vs) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:vgpr(s32) (in function: ashr_s16_vv) +# ERR: remark: :0:0: cannot select: %4:sgpr(s16) = G_ASHR %2:sgpr, %3:sgpr(s16) (in function: ashr_s16_s16_ss) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:vgpr(s32) (in function: ashr_s16_s32_vv) +# ERR-NEXT: remark: :0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: ashr_s16_vv_zext_to_s64) +# ERR-NEXT: remark: :0:0: cannot select: %3:sgpr(s16) = G_ASHR %2:sgpr, %1:sgpr(s32) (in function: ashr_s16_s32_ss) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:sgpr, %1:vgpr(s32) (in function: ashr_s16_s32_sv) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:sgpr(s32) (in function: ashr_s16_s32_vs) +# ERR-NOT: remark -# ERR-GFX910: remark: :0:0: cannot select: %3:sgpr(s16) = G_ASHR %2:sgpr, %1:sgpr(s32) (in function: ashr_s16_ss) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:sgpr, %1:vgpr(s32) (in function: ashr_s16_sv) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:sgpr(s32) (in function: ashr_s16_vs) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:vgpr(s32) (in function: ashr_s16_vv) +--- +name: ashr_s16_s16_ss +legalized: true +regBankSelected: true -# ERR-NOT: remark +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; GFX8-LABEL: name: ashr_s16_s16_ss + ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX8: [[ASHR:%[0-9]+]]:sgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX8: S_ENDPGM 0, implicit [[ASHR]](s16) + ; GFX9-LABEL: name: ashr_s16_s16_ss + ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[ASHR:%[0-9]+]]:sgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX9: S_ENDPGM 0, implicit [[ASHR]](s16) + ; GFX10-LABEL: name: ashr_s16_s16_ss + ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; 
GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10: [[ASHR:%[0-9]+]]:sgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX10: S_ENDPGM 0, implicit [[ASHR]](s16) + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s16) = G_TRUNC %0 + %3:sgpr(s16) = G_TRUNC %1 + %4:sgpr(s16) = G_ASHR %2, %3 + S_ENDPGM 0, implicit %4 +... --- -name: ashr_s16_ss +name: ashr_s16_s16_vs +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; GFX8-LABEL: name: ashr_s16_s16_vs + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX8: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] + ; GFX9-LABEL: name: ashr_s16_s16_vs + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX9: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] + ; GFX10-LABEL: name: ashr_s16_s16_vs + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr(s32) = COPY $sgpr0 + %2:vgpr(s16) = G_TRUNC %0 + %3:sgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ASHR %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: ashr_s16_s32_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: ashr_s16_s32_vv + ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) + ; GFX8: S_ENDPGM 0, implicit [[ASHR]](s16) + ; GFX9-LABEL: name: ashr_s16_s32_vv + ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) + ; GFX9: S_ENDPGM 0, implicit [[ASHR]](s16) + ; GFX10-LABEL: name: ashr_s16_s32_vv + ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) + ; GFX10: S_ENDPGM 0, implicit [[ASHR]](s16) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_ASHR %2, %1 + S_ENDPGM 0, implicit %3 +... 
+ +--- +name: ashr_s16_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: ashr_s16_s16_vv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] + ; GFX9-LABEL: name: ashr_s16_s16_vv + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] + ; GFX10-LABEL: name: ashr_s16_s16_vv + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ASHR %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: ashr_s16_s16_vv_zext_to_s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: ashr_s16_s16_vv_zext_to_s32 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9-LABEL: name: ashr_s16_s16_vv_zext_to_s32 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX10-LABEL: name: ashr_s16_s16_vv_zext_to_s32 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ASHR %2, %3 + %5:vgpr(s32) = G_ZEXT %4 + S_ENDPGM 0, implicit %5 +... 
+ +--- +name: ashr_s16_vv_zext_to_s64 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: ashr_s16_vv_zext_to_s64 + ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX8: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX8: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16) + ; GFX8: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX9-LABEL: name: ashr_s16_vv_zext_to_s64 + ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16) + ; GFX9: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX10-LABEL: name: ashr_s16_vv_zext_to_s64 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX10: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16) + ; GFX10: S_ENDPGM 0, implicit [[ZEXT]](s64) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ASHR %2, %3 + %5:vgpr(s64) = G_ZEXT %4 + S_ENDPGM 0, implicit %5 +... + +--- +name: ashr_s16_s32_ss legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $sgpr1 - ; GFX6-LABEL: name: ashr_s16_ss - ; GFX6: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; GFX6: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[ASHR:%[0-9]+]]:sgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX7-LABEL: name: ashr_s16_ss - ; GFX7: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; GFX7: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[ASHR:%[0-9]+]]:sgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX8-LABEL: name: ashr_s16_ss + + ; GFX8-LABEL: name: ashr_s16_s32_ss ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[ASHR:%[0-9]+]]:sgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX9-LABEL: name: ashr_s16_ss + ; GFX9-LABEL: name: ashr_s16_s32_ss ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[ASHR:%[0-9]+]]:sgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX10-LABEL: name: ashr_s16_ss + ; GFX10-LABEL: name: ashr_s16_s32_ss ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) @@ -68,38 +275,26 @@ body: | ... 
--- -name: ashr_s16_sv +name: ashr_s16_s32_sv legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $vgpr0 - ; GFX6-LABEL: name: ashr_s16_sv - ; GFX6: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX7-LABEL: name: ashr_s16_sv - ; GFX7: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX8-LABEL: name: ashr_s16_sv + ; GFX8-LABEL: name: ashr_s16_s32_sv ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX9-LABEL: name: ashr_s16_sv + ; GFX9-LABEL: name: ashr_s16_s32_sv ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX10-LABEL: name: ashr_s16_sv + ; GFX10-LABEL: name: ashr_s16_s32_sv ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) @@ -113,90 +308,67 @@ body: | ... --- -name: ashr_s16_vs +name: ashr_s16_s16_sv legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $vgpr0 - ; GFX6-LABEL: name: ashr_s16_vs - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX7-LABEL: name: ashr_s16_vs - ; GFX7: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX8-LABEL: name: ashr_s16_vs - ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX8: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX8: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX9-LABEL: name: ashr_s16_vs - ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX9: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX9: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX10-LABEL: name: ashr_s16_vs - ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX10: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX10: S_ENDPGM 0, implicit [[ASHR]](s16) - %0:vgpr(s32) = COPY $vgpr0 - %1:sgpr(s32) = COPY $sgpr0 - %2:vgpr(s16) = G_TRUNC %0 - %3:vgpr(s16) = G_ASHR %2, %1 - S_ENDPGM 0, 
implicit %3 + ; GFX8-LABEL: name: ashr_s16_s16_sv + ; GFX8: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] + ; GFX9-LABEL: name: ashr_s16_s16_sv + ; GFX9: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] + ; GFX10-LABEL: name: ashr_s16_s16_sv + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:vgpr(s32) = COPY $vgpr0 + %2:sgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ASHR %2, %3 + S_ENDPGM 0, implicit %4 ... --- -name: ashr_s16_vv +name: ashr_s16_s32_vs legalized: true regBankSelected: true body: | bb.0: - liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: ashr_s16_vv - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX6: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX7-LABEL: name: ashr_s16_vv - ; GFX7: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX7: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX8-LABEL: name: ashr_s16_vv + liveins: $sgpr0, $vgpr0 + ; GFX8-LABEL: name: ashr_s16_s32_vs ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX9-LABEL: name: ashr_s16_vv + ; GFX9-LABEL: name: ashr_s16_s32_vs ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[ASHR]](s16) - ; GFX10-LABEL: name: ashr_s16_vv + ; GFX10-LABEL: name: ashr_s16_s32_vs ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX10: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[COPY1]](s32) ; GFX10: S_ENDPGM 0, implicit [[ASHR]](s16) %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = COPY $vgpr1 + %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s16) = G_TRUNC %0 %3:vgpr(s16) = G_ASHR %2, %1 S_ENDPGM 0, implicit %3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir index b97f9d384aa91e..2acf1aeb5a7c48 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir @@ -5,6 +5,7 @@ name: constant legalized: true regBankSelected: true +tracksRegLiveness: true body: | @@ -25,28 +26,30 @@ body: | ; GCN: %{{[0-9]+}}:sreg_32 = S_MOV_B32 1065353216 %4:sgpr(s32) = G_FCONSTANT float 1.0 + ; GCN: %5:sreg_64_xexec = S_MOV_B64 4607182418800017408 + %5:sgpr(s64) = G_FCONSTANT double 1.0 + ; GCN: [[LO1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 - ; GCN: [[HI1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1072693248 + ; GCN: [[HI1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1076101120 ; GCN: %{{[0-9]+}}:sreg_64_xexec = REG_SEQUENCE [[LO1]], %subreg.sub0, [[HI1]], %subreg.sub1 - %5:sgpr(s64) = G_FCONSTANT double 1.0 + %6:sgpr(s64) = G_FCONSTANT double 10.0 ; GCN: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 1 - %6:vgpr(s32) = G_CONSTANT i32 1 + %7:vgpr(s32) = G_CONSTANT i32 1 ; GCN: [[LO2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0 ; GCN: [[HI2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1 ; GCN: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE [[LO2]], %subreg.sub0, [[HI2]], %subreg.sub1 - %7:vgpr(s64) = G_CONSTANT i64 4294967296 + %8:vgpr(s64) = G_CONSTANT i64 4294967296 ; GCN: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_e32 1065353216 - %8:vgpr(s32) = G_FCONSTANT float 1.0 + %9:vgpr(s32) = G_FCONSTANT float 1.0 ; GCN: [[LO3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0 ; GCN: [[HI3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1072693248 ; GCN: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE [[LO3]], %subreg.sub0, [[HI3]], %subreg.sub1 - %9:vgpr(s64) = G_FCONSTANT double 1.0 + %10:vgpr(s64) = G_FCONSTANT double 1.0 - S_ENDPGM 0, implicit %2, implicit %4, implicit %6, implicit %8, implicit %3, implicit %5, implicit %7, implicit %9 + S_ENDPGM 0, implicit %2, implicit %4, implicit %5, implicit %6, implicit %8, implicit %3, implicit %5, implicit %7, implicit %9, implicit %10 ... 
- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir index 93e35ead4d49a0..c120c961741282 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir @@ -1,32 +1,35 @@ -# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s + --- -name: insert512 +name: insert_s512_s32 legalized: true regBankSelected: true -# CHECK-LABEL: insert512 -# CHECK: [[BASE:%[0-9]+]]:sreg_512 = IMPLICIT_DEF -# CHECK: [[VAL:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF -# CHECK: [[BASE0:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE]], [[VAL]], %subreg.sub0 -# CHECK: [[BASE1:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE0]], [[VAL]], %subreg.sub1 -# CHECK: [[BASE2:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE1]], [[VAL]], %subreg.sub2 -# CHECK: [[BASE3:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE2]], [[VAL]], %subreg.sub3 -# CHECK: [[BASE4:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE3]], [[VAL]], %subreg.sub4 -# CHECK: [[BASE5:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE4]], [[VAL]], %subreg.sub5 -# CHECK: [[BASE6:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE5]], [[VAL]], %subreg.sub6 -# CHECK: [[BASE7:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE6]], [[VAL]], %subreg.sub7 -# CHECK: [[BASE8:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE7]], [[VAL]], %subreg.sub8 -# CHECK: [[BASE9:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE8]], [[VAL]], %subreg.sub9 -# CHECK: [[BASE10:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE9]], [[VAL]], %subreg.sub10 -# CHECK: [[BASE11:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE10]], [[VAL]], %subreg.sub11 -# CHECK: [[BASE12:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE11]], [[VAL]], %subreg.sub12 -# CHECK: [[BASE13:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE12]], [[VAL]], %subreg.sub13 -# CHECK: [[BASE14:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE13]], [[VAL]], %subreg.sub14 -# CHECK: [[BASE15:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE14]], [[VAL]], %subreg.sub15 - body: | bb.0: + ; CHECK-LABEL: name: insert_s512_s32 + ; CHECK: [[DEF:%[0-9]+]]:sreg_512 = IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[DEF]], [[DEF1]], %subreg.sub0 + ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG]], [[DEF1]], %subreg.sub1 + ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG1]], [[DEF1]], %subreg.sub2 + ; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG2]], [[DEF1]], %subreg.sub3 + ; CHECK: [[INSERT_SUBREG4:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG3]], [[DEF1]], %subreg.sub4 + ; CHECK: [[INSERT_SUBREG5:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG4]], [[DEF1]], %subreg.sub5 + ; CHECK: [[INSERT_SUBREG6:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG5]], [[DEF1]], %subreg.sub6 + ; CHECK: [[INSERT_SUBREG7:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG6]], [[DEF1]], %subreg.sub7 + ; CHECK: [[INSERT_SUBREG8:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG7]], [[DEF1]], %subreg.sub8 + ; CHECK: [[INSERT_SUBREG9:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG8]], [[DEF1]], %subreg.sub9 + ; CHECK: [[INSERT_SUBREG10:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG9]], [[DEF1]], %subreg.sub10 + ; CHECK: 
[[INSERT_SUBREG11:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG10]], [[DEF1]], %subreg.sub11 + ; CHECK: [[INSERT_SUBREG12:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG11]], [[DEF1]], %subreg.sub12 + ; CHECK: [[INSERT_SUBREG13:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG12]], [[DEF1]], %subreg.sub13 + ; CHECK: [[INSERT_SUBREG14:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG13]], [[DEF1]], %subreg.sub14 + ; CHECK: [[INSERT_SUBREG15:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[INSERT_SUBREG14]], [[DEF1]], %subreg.sub15 + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[INSERT_SUBREG15]] + ; CHECK: SI_RETURN_TO_EPILOG $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 %0:sgpr(s512) = G_IMPLICIT_DEF %1:sgpr(s32) = G_IMPLICIT_DEF %2:sgpr(s512) = G_INSERT %0:sgpr, %1:sgpr(s32), 0 @@ -47,3 +50,513 @@ body: | %17:sgpr(s512) = G_INSERT %16:sgpr, %1:sgpr(s32), 480 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %17:sgpr(s512) SI_RETURN_TO_EPILOG $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + +--- + +name: insert_v_s64_v_s32_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_v_s64_v_s32_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + ; CHECK-LABEL: name: insert_v_s64_v_s32_32 + ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_s64_s_s32_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2 + ; CHECK-LABEL: name: insert_s_s64_s_s32_0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_64_xexec = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:sgpr(s32) = COPY $sgpr2 + %2:sgpr(s64) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_s64_s_s32_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2 + ; CHECK-LABEL: name: insert_s_s64_s_s32_32 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr2 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_64_xexec = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:sgpr(s32) = COPY $sgpr2 + %2:sgpr(s64) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... 
+ +--- + +name: insert_s_s64_v_s32_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $vgpr0 + ; CHECK-LABEL: name: insert_s_s64_v_s32_32 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s64) = COPY $sgpr0_sgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:vgpr(s64) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_v_s64_s_s32_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0 + ; CHECK-LABEL: name: insert_v_s64_s_s32_32 + ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:sgpr(s32) = COPY $sgpr0 + %2:vgpr(s64) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_v_s96_v_s64_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4 + ; CHECK-LABEL: name: insert_v_s96_v_s64_0 + ; CHECK: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_96 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s96) = COPY $vgpr0_vgpr1_vgpr2 + %1:vgpr(s64) = COPY $vgpr3_vgpr4 + %2:vgpr(s96) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_v_s96_v_s64_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4 + ; CHECK-LABEL: name: insert_v_s96_v_s64_32 + ; CHECK: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_96 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s96) = COPY $vgpr0_vgpr1_vgpr2 + %1:vgpr(s64) = COPY $vgpr3_vgpr4 + %2:vgpr(s96) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_s96_s_s64_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2, $sgpr4_sgpr5 + ; CHECK-LABEL: name: insert_s_s96_s_s64_0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_96_with_sub0_sub1 = COPY $sgpr0_sgpr1_sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_96 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s96) = COPY $sgpr0_sgpr1_sgpr2 + %1:sgpr(s64) = COPY $sgpr4_sgpr5 + %2:sgpr(s96) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 +... 
+ +--- + +name: insert_s_s96_s_s64_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2, $sgpr4_sgpr5 + ; CHECK-LABEL: name: insert_s_s96_s_s64_32 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_96_with_sub1_sub2 = COPY $sgpr0_sgpr1_sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_96 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s96) = COPY $sgpr0_sgpr1_sgpr2 + %1:sgpr(s64) = COPY $sgpr4_sgpr5 + %2:sgpr(s96) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_s128_s_s64_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5 + ; CHECK-LABEL: name: insert_s_s128_s_s64_0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_128 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr(s64) = COPY $sgpr4_sgpr5 + %2:sgpr(s128) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 +... + +# --- + +# name: insert_s_s128_s_s64_32 +# legalized: true +# regBankSelected: true + +# body: | +# bb.0: +# liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5 +# %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 +# %1:sgpr(s64) = COPY $sgpr4_sgpr5 +# %2:sgpr(s128) = G_INSERT %0, %1, 32 +# S_ENDPGM 0, implicit %2 +# ... + +--- + +name: insert_s_s128_s_s64_64 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5 + ; CHECK-LABEL: name: insert_s_s128_s_s64_64 + ; CHECK: [[COPY:%[0-9]+]]:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_128 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub2_sub3 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr(s64) = COPY $sgpr4_sgpr5 + %2:sgpr(s128) = G_INSERT %0, %1, 64 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_v256_v_s64_96 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9 + ; CHECK-LABEL: name: insert_s_v256_v_s64_96 + ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr8_vgpr9 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub3_sub4 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:vgpr(s64) = COPY $vgpr8_vgpr9 + %2:vgpr(s256) = G_INSERT %0, %1, 96 + S_ENDPGM 0, implicit %2 +... 
+ +--- + +name: insert_s_s256_s_s64_128 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9 + ; CHECK-LABEL: name: insert_s_s256_s_s64_128 + ; CHECK: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub4_sub5 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s256) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + %1:sgpr(s64) = COPY $sgpr4_sgpr5 + %2:sgpr(s256) = G_INSERT %0, %1, 128 + S_ENDPGM 0, implicit %2 +... + +# --- + +# name: insert_s_s256_s_s64_160 +# legalized: true +# regBankSelected: true + +# body: | +# bb.0: +# liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9 +# %0:sgpr(s256) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 +# %1:sgpr(s64) = COPY $sgpr4_sgpr5 +# %2:sgpr(s256) = G_INSERT %0, %1, 160 +# S_ENDPGM 0, implicit %2 +# ... + +--- + +name: insert_s_s128_s_s96_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr6_sgpr7_sgpr8 + ; CHECK-LABEL: name: insert_s_s128_s_s96_0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_128_with_sub0_sub1_sub2 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_96 = COPY $sgpr6_sgpr7_sgpr8 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_128 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1_sub2 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 + %2:sgpr(s128) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_s128_s_s96_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr6_sgpr7_sgpr8 + ; CHECK-LABEL: name: insert_s_s128_s_s96_32 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_128_with_sub1_sub2_sub3 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_96 = COPY $sgpr6_sgpr7_sgpr8 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_128 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2_sub3 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s128) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 + %2:sgpr(s128) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_s160_s_s96_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, $sgpr6_sgpr7_sgpr8 + ; CHECK-LABEL: name: insert_s_s160_s_s96_0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_160_with_sub0_sub1_sub2 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_96 = COPY $sgpr6_sgpr7_sgpr8 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_160 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1_sub2 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s160) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 + %2:sgpr(s160) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 +... 
+ +--- + +name: insert_s_s160_s_s96_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, $sgpr6_sgpr7_sgpr8 + ; CHECK-LABEL: name: insert_s_s160_s_s96_32 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_160_with_sub1_sub2_sub3 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_96 = COPY $sgpr6_sgpr7_sgpr8 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_160 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2_sub3 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s160) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 + %2:sgpr(s160) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_s160_s_s96_64 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, $sgpr6_sgpr7_sgpr8 + ; CHECK-LABEL: name: insert_s_s160_s_s96_64 + ; CHECK: [[COPY:%[0-9]+]]:sgpr_160_with_sub2_sub3_sub4 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_96 = COPY $sgpr6_sgpr7_sgpr8 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_160 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub2_sub3_sub4 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s160) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + %1:sgpr(s96) = COPY $sgpr6_sgpr7_sgpr8 + %2:sgpr(s160) = G_INSERT %0, %1, 64 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_s_s256_s_s128_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11 + + ; CHECK-LABEL: name: insert_s_s256_s_s128_0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK: [[COPY1:%[0-9]+]]:sreg_128 = COPY $sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:sreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub0_sub1_sub2_sub3 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:sgpr(s256) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + %1:sgpr(s128) = COPY $sgpr8_sgpr9_sgpr10_sgpr11 + %2:sgpr(s256) = G_INSERT %0, %1, 0 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_v_s256_v_s128_32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 + + ; CHECK-LABEL: name: insert_v_s256_v_s128_32 + ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub1_sub2_sub3_sub4 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:vgpr(s128) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + %2:vgpr(s256) = G_INSERT %0, %1, 32 + S_ENDPGM 0, implicit %2 +... 
+ +--- + +name: insert_v_s256_v_s128_64 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 + + ; CHECK-LABEL: name: insert_v_s256_v_s128_64 + ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub2_sub3_sub4_sub5 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:vgpr(s128) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + %2:vgpr(s256) = G_INSERT %0, %1, 64 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_v_s256_v_s128_96 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 + + ; CHECK-LABEL: name: insert_v_s256_v_s128_96 + ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub3_sub4_sub5_sub6 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:vgpr(s128) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + %2:vgpr(s256) = G_INSERT %0, %1, 96 + S_ENDPGM 0, implicit %2 +... + +--- + +name: insert_v_s256_v_s128_128 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11 + + ; CHECK-LABEL: name: insert_v_s256_v_s128_128 + ; CHECK: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:vreg_256 = INSERT_SUBREG [[COPY]], [[COPY1]], %subreg.sub4_sub5_sub6_sub7 + ; CHECK: S_ENDPGM 0, implicit [[INSERT_SUBREG]] + %0:vgpr(s256) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:vgpr(s128) = COPY $vgpr8_vgpr9_vgpr10_vgpr11 + %2:vgpr(s256) = G_INSERT %0, %1, 128 + S_ENDPGM 0, implicit %2 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir index 513f5b08c6a23c..65a7fc7f4aa704 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir @@ -20,12 +20,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_4 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -51,12 +51,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_2 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) + ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_2 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) + ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 5) @@ -82,12 +82,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -208,12 +208,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2047 @@ -243,14 +243,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2147483647 @@ -283,12 +283,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2048 @@ -318,14 +318,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2047 @@ -355,14 +355,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN 
[[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2048 @@ -392,12 +392,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4095 @@ -427,14 +427,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4096 @@ -464,14 +464,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 
1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4095 @@ -501,14 +501,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4096 @@ -538,14 +538,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8191 @@ -575,14 +575,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8192 @@ -612,14 +612,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8191 @@ -649,14 +649,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8192 @@ -681,10 +681,10 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_4_constant_0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_4_constant_0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] %0:vgpr(p5) = G_CONSTANT i32 0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -707,10 +707,10 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] %0:sgpr(p5) = G_CONSTANT i32 16 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -733,10 +733,10 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095 - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: 
[[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]] %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -760,11 +760,11 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -789,10 +789,10 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_fi - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_fi - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -820,10 +820,10 @@ body: | ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, 
addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 @@ -853,13 +853,13 @@ body: | ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir index 367c92b5243e43..30cb3f032d76ca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir @@ -190,9 +190,7 @@ body: | # Test a load of an offset from a constant base address # GCN-LABEL: name: constant_address_positive{{$}} -# GCN: %4:sreg_32_xm0 = S_MOV_B32 44 -# GCN: %5:sreg_32_xm0 = S_MOV_B32 0 -# GCN: %0:sreg_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1 +# GCN: %0:sreg_64 = S_MOV_B64 44 # VI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 64, 0, 0 :: (dereferenceable invariant load 4, addrspace 4) # SICI: %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0, 0 :: (dereferenceable invariant load 4, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir index 2a2f600c5b7c69..65e9af7719665b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir @@ -10,51 +10,258 @@ # RUN: FileCheck 
-check-prefixes=ERR-GFX910,ERR %s < %t # ERR-NOT: remark -# ERR-GFX8: remark: :0:0: cannot select: %3:sgpr(s16) = G_LSHR %2:sgpr, %1:sgpr(s32) (in function: lshr_s16_ss) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:sgpr, %1:vgpr(s32) (in function: lshr_s16_sv) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:sgpr(s32) (in function: lshr_s16_vs) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:vgpr(s32) (in function: lshr_s16_vv) +# ERR: remark: :0:0: cannot select: %4:sgpr(s16) = G_LSHR %2:sgpr, %3:sgpr(s16) (in function: lshr_s16_s16_ss) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:vgpr(s32) (in function: lshr_s16_s32_vv) +# ERR-NEXT: remark: :0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: lshr_s16_vv_zext_to_s64) +# ERR-NEXT: remark: :0:0: cannot select: %3:sgpr(s16) = G_LSHR %2:sgpr, %1:sgpr(s32) (in function: lshr_s16_s32_ss) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:sgpr, %1:vgpr(s32) (in function: lshr_s16_s32_sv) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:sgpr(s32) (in function: lshr_s16_s32_vs) +# ERR-NOT: remark -# ERR-GFX910: remark: :0:0: cannot select: %3:sgpr(s16) = G_LSHR %2:sgpr, %1:sgpr(s32) (in function: lshr_s16_ss) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:sgpr, %1:vgpr(s32) (in function: lshr_s16_sv) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:sgpr(s32) (in function: lshr_s16_vs) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:vgpr(s32) (in function: lshr_s16_vv) +--- +name: lshr_s16_s16_ss +legalized: true +regBankSelected: true -# ERR-NOT: remark +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; GFX8-LABEL: name: lshr_s16_s16_ss + ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX8: [[LSHR:%[0-9]+]]:sgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX8: S_ENDPGM 0, implicit [[LSHR]](s16) + ; GFX9-LABEL: name: lshr_s16_s16_ss + ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:sgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX9: S_ENDPGM 0, implicit [[LSHR]](s16) + ; GFX10-LABEL: name: lshr_s16_s16_ss + ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10: [[LSHR:%[0-9]+]]:sgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX10: S_ENDPGM 0, implicit [[LSHR]](s16) + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s16) = G_TRUNC %0 + %3:sgpr(s16) = G_TRUNC %1 + %4:sgpr(s16) = G_LSHR %2, %3 + S_ENDPGM 0, implicit %4 +... 
--- -name: lshr_s16_ss +name: lshr_s16_s16_vs +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; GFX8-LABEL: name: lshr_s16_s16_vs + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX8: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] + ; GFX9-LABEL: name: lshr_s16_s16_vs + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX9: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] + ; GFX10-LABEL: name: lshr_s16_s16_vs + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr(s32) = COPY $sgpr0 + %2:vgpr(s16) = G_TRUNC %0 + %3:sgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_LSHR %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: lshr_s16_s32_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: lshr_s16_s32_vv + ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) + ; GFX8: S_ENDPGM 0, implicit [[LSHR]](s16) + ; GFX9-LABEL: name: lshr_s16_s32_vv + ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) + ; GFX9: S_ENDPGM 0, implicit [[LSHR]](s16) + ; GFX10-LABEL: name: lshr_s16_s32_vv + ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) + ; GFX10: S_ENDPGM 0, implicit [[LSHR]](s16) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_LSHR %2, %1 + S_ENDPGM 0, implicit %3 +... 
+ +--- +name: lshr_s16_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: lshr_s16_s16_vv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] + ; GFX9-LABEL: name: lshr_s16_s16_vv + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] + ; GFX10-LABEL: name: lshr_s16_s16_vv + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_LSHR %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: lshr_s16_s16_vv_zext_to_s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: lshr_s16_s16_vv_zext_to_s32 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9-LABEL: name: lshr_s16_s16_vv_zext_to_s32 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX10-LABEL: name: lshr_s16_s16_vv_zext_to_s32 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_LSHR %2, %3 + %5:vgpr(s32) = G_ZEXT %4 + S_ENDPGM 0, implicit %5 +... 
+ +--- +name: lshr_s16_vv_zext_to_s64 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: lshr_s16_vv_zext_to_s64 + ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX8: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX8: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16) + ; GFX8: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX9-LABEL: name: lshr_s16_vv_zext_to_s64 + ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16) + ; GFX9: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX10-LABEL: name: lshr_s16_vv_zext_to_s64 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16) + ; GFX10: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16) + ; GFX10: S_ENDPGM 0, implicit [[ZEXT]](s64) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_LSHR %2, %3 + %5:vgpr(s64) = G_ZEXT %4 + S_ENDPGM 0, implicit %5 +... + +--- +name: lshr_s16_s32_ss legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $sgpr1 - ; GFX6-LABEL: name: lshr_s16_ss - ; GFX6: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; GFX6: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[LSHR:%[0-9]+]]:sgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX7-LABEL: name: lshr_s16_ss - ; GFX7: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; GFX7: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[LSHR:%[0-9]+]]:sgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX8-LABEL: name: lshr_s16_ss + + ; GFX8-LABEL: name: lshr_s16_s32_ss ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[LSHR:%[0-9]+]]:sgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX9-LABEL: name: lshr_s16_ss + ; GFX9-LABEL: name: lshr_s16_s32_ss ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[LSHR:%[0-9]+]]:sgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX10-LABEL: name: lshr_s16_ss + ; GFX10-LABEL: name: lshr_s16_s32_ss ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) @@ -68,38 +275,26 @@ body: | ... 
--- -name: lshr_s16_sv +name: lshr_s16_s32_sv legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $vgpr0 - ; GFX6-LABEL: name: lshr_s16_sv - ; GFX6: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX7-LABEL: name: lshr_s16_sv - ; GFX7: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX8-LABEL: name: lshr_s16_sv + ; GFX8-LABEL: name: lshr_s16_s32_sv ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX9-LABEL: name: lshr_s16_sv + ; GFX9-LABEL: name: lshr_s16_s32_sv ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX10-LABEL: name: lshr_s16_sv + ; GFX10-LABEL: name: lshr_s16_s32_sv ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) @@ -113,90 +308,67 @@ body: | ... --- -name: lshr_s16_vs +name: lshr_s16_s16_sv legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $vgpr0 - ; GFX6-LABEL: name: lshr_s16_vs - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX7-LABEL: name: lshr_s16_vs - ; GFX7: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX8-LABEL: name: lshr_s16_vs - ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX8: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX8: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX9-LABEL: name: lshr_s16_vs - ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX9: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX9: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX10-LABEL: name: lshr_s16_vs - ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX10: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX10: S_ENDPGM 0, implicit [[LSHR]](s16) - %0:vgpr(s32) = COPY $vgpr0 - %1:sgpr(s32) = COPY $sgpr0 - %2:vgpr(s16) = G_TRUNC %0 - %3:vgpr(s16) = G_LSHR %2, %1 - S_ENDPGM 0, 
implicit %3 + ; GFX8-LABEL: name: lshr_s16_s16_sv + ; GFX8: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] + ; GFX9-LABEL: name: lshr_s16_s16_sv + ; GFX9: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] + ; GFX10-LABEL: name: lshr_s16_s16_sv + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:vgpr(s32) = COPY $vgpr0 + %2:sgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_LSHR %2, %3 + S_ENDPGM 0, implicit %4 ... --- -name: lshr_s16_vv +name: lshr_s16_s32_vs legalized: true regBankSelected: true body: | bb.0: - liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: lshr_s16_vv - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX6: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX7-LABEL: name: lshr_s16_vv - ; GFX7: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX7: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX8-LABEL: name: lshr_s16_vv + liveins: $sgpr0, $vgpr0 + ; GFX8-LABEL: name: lshr_s16_s32_vs ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX9-LABEL: name: lshr_s16_vv + ; GFX9-LABEL: name: lshr_s16_s32_vs ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[LSHR]](s16) - ; GFX10-LABEL: name: lshr_s16_vv + ; GFX10-LABEL: name: lshr_s16_s32_vs ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX10: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[COPY1]](s32) ; GFX10: S_ENDPGM 0, implicit [[LSHR]](s16) %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = COPY $vgpr1 + %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s16) = G_TRUNC %0 %3:vgpr(s16) = G_LSHR %2, %1 S_ENDPGM 0, implicit %3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrtoint.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrtoint.mir new file mode 100644 index 00000000000000..53bd5e12a45b73 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ptrtoint.mir @@ -0,0 +1,101 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +--- + +name: ptrtoint_s_p3_to_s_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: ptrtoint_s_p3_to_s_s32 + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; CHECK: S_ENDPGM 0, implicit [[COPY]] + %0:sgpr(p3) = COPY $sgpr0 + %1:sgpr(s32) = G_PTRTOINT %0 + S_ENDPGM 0, implicit %1 +... + +--- + +name: ptrtoint_s_p5_to_s_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: ptrtoint_s_p5_to_s_s32 + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; CHECK: S_ENDPGM 0, implicit [[COPY]] + %0:sgpr(p5) = COPY $sgpr0 + %1:sgpr(s32) = G_PTRTOINT %0 + S_ENDPGM 0, implicit %1 +... + +--- + +name: ptrtoint_s_p0_to_s_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: ptrtoint_s_p0_to_s_s64 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr0_sgpr1 + ; CHECK: S_ENDPGM 0, implicit [[COPY]] + %0:sgpr(p0) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_PTRTOINT %0 + S_ENDPGM 0, implicit %1 +... + +--- + +name: ptrtoint_s_p1_to_s_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: ptrtoint_s_p1_to_s_s64 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr0_sgpr1 + ; CHECK: S_ENDPGM 0, implicit [[COPY]] + %0:sgpr(p1) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_PTRTOINT %0 + S_ENDPGM 0, implicit %1 +... + +--- + +name: ptrtoint_s_p999_to_s_s64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: ptrtoint_s_p999_to_s_s64 + ; CHECK: liveins: $sgpr0_sgpr1 + ; CHECK: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr0_sgpr1 + ; CHECK: S_ENDPGM 0, implicit [[COPY]] + %0:sgpr(p999) = COPY $sgpr0_sgpr1 + %1:sgpr(s64) = G_PTRTOINT %0 + S_ENDPGM 0, implicit %1 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir index d41cdee39040f4..07b4f9a3dcd910 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir @@ -10,51 +10,258 @@ # RUN: FileCheck -check-prefixes=ERR-GFX910,ERR %s < %t # ERR-NOT: remark -# ERR-GFX8: remark: :0:0: cannot select: %3:sgpr(s16) = G_SHL %2:sgpr, %1:sgpr(s32) (in function: shl_s16_ss) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:sgpr, %1:vgpr(s32) (in function: shl_s16_sv) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:sgpr(s32) (in function: shl_s16_vs) -# ERR-GFX8-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:vgpr(s32) (in function: shl_s16_vv) +# ERR: remark: :0:0: cannot select: %4:sgpr(s16) = G_SHL %2:sgpr, %3:sgpr(s16) (in function: shl_s16_s16_ss) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:vgpr(s32) (in function: shl_s16_s32_vv) +# ERR-NEXT: remark: :0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: shl_s16_vv_zext_to_s64) +# ERR-NEXT: remark: :0:0: cannot select: %3:sgpr(s16) = G_SHL %2:sgpr, %1:sgpr(s32) (in function: shl_s16_s32_ss) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:sgpr, %1:vgpr(s32) (in function: shl_s16_s32_sv) +# ERR-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:sgpr(s32) (in function: shl_s16_s32_vs) +# ERR-NOT: remark -# ERR-GFX910: remark: :0:0: cannot select: %3:sgpr(s16) = G_SHL %2:sgpr, %1:sgpr(s32) (in function: shl_s16_ss) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:sgpr, %1:vgpr(s32) (in function: shl_s16_sv) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:sgpr(s32) (in function: shl_s16_vs) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:vgpr(s32) (in function: shl_s16_vv) +--- +name: shl_s16_s16_ss +legalized: true +regBankSelected: true -# ERR-NOT: remark +body: | + bb.0: + liveins: $sgpr0, $sgpr1 + + ; GFX8-LABEL: name: shl_s16_s16_ss + ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX8: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX8: S_ENDPGM 0, implicit [[SHL]](s16) + ; GFX9-LABEL: name: shl_s16_s16_ss + ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX9: S_ENDPGM 0, implicit [[SHL]](s16) + ; GFX10-LABEL: name: shl_s16_s16_ss + ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX10: S_ENDPGM 0, implicit [[SHL]](s16) + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s16) = G_TRUNC %0 + %3:sgpr(s16) = G_TRUNC %1 + %4:sgpr(s16) = G_SHL %2, %3 + S_ENDPGM 0, implicit %4 +... 
--- -name: shl_s16_ss +name: shl_s16_s16_vs +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; GFX8-LABEL: name: shl_s16_s16_vs + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX8: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX9-LABEL: name: shl_s16_s16_vs + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX9: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX10-LABEL: name: shl_s16_s16_vs + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr(s32) = COPY $sgpr0 + %2:vgpr(s16) = G_TRUNC %0 + %3:sgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_SHL %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: shl_s16_s32_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: shl_s16_s32_vv + ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) + ; GFX8: S_ENDPGM 0, implicit [[SHL]](s16) + ; GFX9-LABEL: name: shl_s16_s32_vv + ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) + ; GFX9: S_ENDPGM 0, implicit [[SHL]](s16) + ; GFX10-LABEL: name: shl_s16_s32_vv + ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) + ; GFX10: S_ENDPGM 0, implicit [[SHL]](s16) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_SHL %2, %1 + S_ENDPGM 0, implicit %3 +... 
+ +--- +name: shl_s16_s16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: shl_s16_s16_vv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX9-LABEL: name: shl_s16_s16_vv + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX10-LABEL: name: shl_s16_s16_vv + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_SHL %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: shl_s16_s16_vv_zext_to_s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: shl_s16_s16_vv_zext_to_s32 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9-LABEL: name: shl_s16_s16_vv_zext_to_s32 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX10-LABEL: name: shl_s16_s16_vv_zext_to_s32 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_SHL %2, %3 + %5:vgpr(s32) = G_ZEXT %4 + S_ENDPGM 0, implicit %5 +... 
+ +--- +name: shl_s16_vv_zext_to_s64 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: shl_s16_vv_zext_to_s64 + ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX8: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX8: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX8: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16) + ; GFX8: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX9-LABEL: name: shl_s16_vv_zext_to_s64 + ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16) + ; GFX9: S_ENDPGM 0, implicit [[ZEXT]](s64) + ; GFX10-LABEL: name: shl_s16_vv_zext_to_s64 + ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) + ; GFX10: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX10: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16) + ; GFX10: S_ENDPGM 0, implicit [[ZEXT]](s64) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_SHL %2, %3 + %5:vgpr(s64) = G_ZEXT %4 + S_ENDPGM 0, implicit %5 +... + +--- +name: shl_s16_s32_ss legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $sgpr1 - ; GFX6-LABEL: name: shl_s16_ss - ; GFX6: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; GFX6: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX7-LABEL: name: shl_s16_ss - ; GFX7: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 - ; GFX7: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX8-LABEL: name: shl_s16_ss + + ; GFX8-LABEL: name: shl_s16_s32_ss ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX9-LABEL: name: shl_s16_ss + ; GFX9-LABEL: name: shl_s16_s32_ss ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX10-LABEL: name: shl_s16_ss + ; GFX10-LABEL: name: shl_s16_s32_ss ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) @@ -68,38 +275,26 @@ body: | ... 
--- -name: shl_s16_sv +name: shl_s16_s32_sv legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $vgpr0 - ; GFX6-LABEL: name: shl_s16_sv - ; GFX6: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX7-LABEL: name: shl_s16_sv - ; GFX7: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX8-LABEL: name: shl_s16_sv + ; GFX8-LABEL: name: shl_s16_s32_sv ; GFX8: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX8: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX9-LABEL: name: shl_s16_sv + ; GFX9-LABEL: name: shl_s16_s32_sv ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX10-LABEL: name: shl_s16_sv + ; GFX10-LABEL: name: shl_s16_s32_sv ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32) @@ -113,90 +308,67 @@ body: | ... --- -name: shl_s16_vs +name: shl_s16_s16_sv legalized: true regBankSelected: true body: | bb.0: liveins: $sgpr0, $vgpr0 - ; GFX6-LABEL: name: shl_s16_vs - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX6: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX7-LABEL: name: shl_s16_vs - ; GFX7: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX7: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX8-LABEL: name: shl_s16_vs - ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX8: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX8: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX9-LABEL: name: shl_s16_vs - ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX9: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX9: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX10-LABEL: name: shl_s16_vs - ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX10: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX10: S_ENDPGM 0, implicit [[SHL]](s16) - %0:vgpr(s32) = COPY $vgpr0 - %1:sgpr(s32) = COPY $sgpr0 - %2:vgpr(s16) = G_TRUNC %0 - %3:vgpr(s16) = G_SHL %2, %1 - S_ENDPGM 0, implicit %3 + ; GFX8-LABEL: name: shl_s16_s16_sv 
+ ; GFX8: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX9-LABEL: name: shl_s16_s16_sv + ; GFX9: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] + ; GFX10-LABEL: name: shl_s16_s16_sv + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec + ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:vgpr(s32) = COPY $vgpr0 + %2:sgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_SHL %2, %3 + S_ENDPGM 0, implicit %4 ... --- -name: shl_s16_vv +name: shl_s16_s32_vs legalized: true regBankSelected: true body: | bb.0: - liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: shl_s16_vv - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX6: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX6: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX6: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX7-LABEL: name: shl_s16_vv - ; GFX7: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX7: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) - ; GFX7: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) - ; GFX7: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX8-LABEL: name: shl_s16_vv + liveins: $sgpr0, $vgpr0 + ; GFX8-LABEL: name: shl_s16_s32_vs ; GFX8: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX8: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX8: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX8: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX8: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX8: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX9-LABEL: name: shl_s16_vv + ; GFX9-LABEL: name: shl_s16_s32_vs ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX9: S_ENDPGM 0, implicit [[SHL]](s16) - ; GFX10-LABEL: name: shl_s16_vv + ; GFX10-LABEL: name: shl_s16_s32_vs ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32) ; GFX10: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32) ; GFX10: S_ENDPGM 0, implicit [[SHL]](s16) %0:vgpr(s32) = COPY $vgpr0 - %1:vgpr(s32) = COPY $vgpr1 + %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s16) = G_TRUNC %0 %3:vgpr(s16) = G_SHL %2, %1 S_ENDPGM 0, implicit %3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir index 
2d8634025c96e1..ee5ff53ca67694 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir @@ -21,12 +21,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_4 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -52,12 +52,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_2 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 2, align 2, addrspace 5) @@ -83,12 +83,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_1 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 1, align 1, addrspace 5) @@ -114,12 +114,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: 
BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: store_private_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -145,12 +145,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: store_private_p3 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -176,12 +176,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: store_private_p5 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -209,10 +209,10 @@ body: | ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_1_fi_offset_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(p5) = G_GEP %0, %1 @@ -239,10 +239,10 @@ body: | ; GFX6-LABEL: name: store_private_s32_to_1_constant_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_1_constant_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) @@ -268,11 +268,11 @@ body: | ; GFX6-LABEL: name: store_private_s32_to_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index 3bbfed7781abfd..67c8079b7d6b4b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck %s ; TODO: Replace with existing DAG tests @@ -10,9 +12,9 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: s_add_u32 s2, 4, 4 -; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v2, v2 +; CHECK-NEXT: ds_read_b32 v2, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir index 9d1c7fb1011c83..a27040cebc4aec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -259,9 +259,9 @@ body: | ; VI: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; VI: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 ; VI: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) - ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 68 + ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY2]], [[C2]](s64) - ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 68, addrspace 4) + ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 64, align 64, addrspace 4) ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY1]](p3), [[C]] ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3) ; VI: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) @@ -271,7 +271,7 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; GFX9: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 - ; GFX9: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735 + ; GFX9: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C2]](s32) ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY]](p3), [[C]] @@ -465,16 +465,16 @@ body: | ; VI: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; VI: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 ; VI: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) - ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 68 + ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY2]], [[C2]](s64) - ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 68, addrspace 4) + ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 64, align 64, addrspace 4) ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]] ; VI: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) ; VI: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) ; VI: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] ; VI: [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY3]], [[C2]](s64) - ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 68, addrspace 4) + ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[GEP1]](p4) :: (dereferenceable invariant load 4 from `i8 addrspace(4)* undef` + 64, align 64, addrspace 4) ; VI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]] ; VI: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) ; VI: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[LOAD1]](s32) @@ -486,14 +486,14 @@ body: | ; GFX9: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) ; GFX9: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; GFX9: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 - ; GFX9: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735 + ; GFX9: [[S_GETREG_B32_:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_]], [[C2]](s32) ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]] ; GFX9: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) ; GFX9: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[SHL]](s32) ; GFX9: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] - ; GFX9: [[S_GETREG_B32_1:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 30735 + ; GFX9: [[S_GETREG_B32_1:%[0-9]+]]:sreg_32(s32) = S_GETREG_B32 31759 ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[S_GETREG_B32_1]], [[C2]](s32) ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]] ; GFX9: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir index 5ea807166fe53d..e4cc48d54dd66d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir @@ -522,3 +522,169 @@ body: | %3:_(<4 x s32>) = G_ANYEXT %2 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 ... 
+ +--- +name: test_and_v8s8 +body: | + bb.0: + + ; CHECK-LABEL: name: test_and_v8s8 + ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[DEF]](<8 x s32>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[DEF1]](<8 x s32>) + ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>) + ; CHECK: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8), [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[TRUNC1]](<8 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s8) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s8) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[ANYEXT3]] + ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s8) + ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s8) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[ANYEXT5]] + ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s8) + ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s8) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[ANYEXT7]] + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s8) + ; CHECK: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s8) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT8]], [[ANYEXT9]] + ; CHECK: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s8) + ; CHECK: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s8) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT10]], [[ANYEXT11]] + ; CHECK: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s8) + ; CHECK: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s8) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT12]], [[ANYEXT13]] + ; CHECK: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s8) + ; CHECK: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s8) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT14]], [[ANYEXT15]] + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND4]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[AND5]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND6]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[AND7]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY 
[[BUILD_VECTOR]](<8 x s32>) + %0:_(<8 x s8>) = G_IMPLICIT_DEF + %1:_(<8 x s8>) = G_IMPLICIT_DEF + %2:_(<8 x s8>) = G_AND %0, %1 + %3:_(<8 x s32>) = G_ANYEXT %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %3 +... + +--- +name: test_and_v16s8 +body: | + bb.0: + + ; CHECK-LABEL: name: test_and_v16s8 + ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[DEF]](<16 x s32>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[DEF1]](<16 x s32>) + ; CHECK: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<16 x s8>) + ; CHECK: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[TRUNC1]](<16 x s8>) + ; CHECK: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV]](<8 x s8>) + ; CHECK: [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8), [[UV16:%[0-9]+]]:_(s8), [[UV17:%[0-9]+]]:_(s8), [[UV18:%[0-9]+]]:_(s8), [[UV19:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV2]](<8 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s8) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s8) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s8) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[ANYEXT3]] + ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s8) + ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s8) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[ANYEXT5]] + ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s8) + ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s8) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[ANYEXT7]] + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s8) + ; CHECK: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s8) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT8]], [[ANYEXT9]] + ; CHECK: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s8) + ; CHECK: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s8) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT10]], [[ANYEXT11]] + ; CHECK: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s8) + ; CHECK: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s8) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT12]], [[ANYEXT13]] + ; CHECK: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s8) + ; CHECK: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s8) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT14]], [[ANYEXT15]] + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND4]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[AND5]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND6]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[AND7]](s32) + ; CHECK: [[UV20:%[0-9]+]]:_(s8), [[UV21:%[0-9]+]]:_(s8), [[UV22:%[0-9]+]]:_(s8), [[UV23:%[0-9]+]]:_(s8), [[UV24:%[0-9]+]]:_(s8), [[UV25:%[0-9]+]]:_(s8), [[UV26:%[0-9]+]]:_(s8), [[UV27:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV1]](<8 x s8>) + ; CHECK: [[UV28:%[0-9]+]]:_(s8), [[UV29:%[0-9]+]]:_(s8), 
[[UV30:%[0-9]+]]:_(s8), [[UV31:%[0-9]+]]:_(s8), [[UV32:%[0-9]+]]:_(s8), [[UV33:%[0-9]+]]:_(s8), [[UV34:%[0-9]+]]:_(s8), [[UV35:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV3]](<8 x s8>) + ; CHECK: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s8) + ; CHECK: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV28]](s8) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[ANYEXT16]], [[ANYEXT17]] + ; CHECK: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s8) + ; CHECK: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV29]](s8) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ANYEXT18]], [[ANYEXT19]] + ; CHECK: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s8) + ; CHECK: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV30]](s8) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[ANYEXT20]], [[ANYEXT21]] + ; CHECK: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s8) + ; CHECK: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV31]](s8) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[ANYEXT22]], [[ANYEXT23]] + ; CHECK: [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[UV24]](s8) + ; CHECK: [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[UV32]](s8) + ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[ANYEXT24]], [[ANYEXT25]] + ; CHECK: [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[UV25]](s8) + ; CHECK: [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[UV33]](s8) + ; CHECK: [[AND13:%[0-9]+]]:_(s32) = G_AND [[ANYEXT26]], [[ANYEXT27]] + ; CHECK: [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[UV26]](s8) + ; CHECK: [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[UV34]](s8) + ; CHECK: [[AND14:%[0-9]+]]:_(s32) = G_AND [[ANYEXT28]], [[ANYEXT29]] + ; CHECK: [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[UV27]](s8) + ; CHECK: [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[UV35]](s8) + ; CHECK: [[AND15:%[0-9]+]]:_(s32) = G_AND [[ANYEXT30]], [[ANYEXT31]] + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[AND8]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND9]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AND10]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND11]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[AND12]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[AND13]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[AND14]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[AND15]](s32) + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32) + ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32) + ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) + ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) + ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) + ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32) + ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) + ; CHECK: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BUILD_VECTOR]](<16 x s32>) + %0:_(<16 x s8>) = G_IMPLICIT_DEF + %1:_(<16 x s8>) = G_IMPLICIT_DEF + %2:_(<16 x s8>) = G_AND %0, %1 + %3:_(<16 x s32>) = G_ANYEXT %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir new file mode 100644 index 00000000000000..b5fe7334e066a1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir @@ -0,0 +1,107 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -O0 -run-pass=legalizer %s -o - | FileCheck %s + +--- +name: test_atomic_cmpxchg_with_success_s32_global +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: test_atomic_cmpxchg_with_success_s32_global + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p1), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic 4, addrspace 1) + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[COPY1]] + ; CHECK: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s32) = COPY $vgpr3 + %3:_(s32), %4:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS %0, %1, %2 :: (load store syncscope("agent-one-as") monotonic monotonic 4, addrspace 1) + S_ENDPGM 0, implicit %3, implicit %4 + +... + +--- +name: test_atomic_cmpxchg_with_success_s32_flat +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: test_atomic_cmpxchg_with_success_s32_flat + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p0), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic 4) + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[COPY1]] + ; CHECK: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + %0:_(p0) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s32) = COPY $vgpr3 + %3:_(s32), %4:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS %0, %1, %2 :: (load store syncscope("agent-one-as") monotonic monotonic 4, addrspace 0) + S_ENDPGM 0, implicit %3, implicit %4 + +... 
+ +--- +name: test_atomic_cmpxchg_with_success_s32_local +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_atomic_cmpxchg_with_success_s32_local + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic 4, addrspace 3) + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[COPY1]] + ; CHECK: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + %0:_(p3) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32), %4:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS %0, %1, %2 :: (load store syncscope("agent-one-as") monotonic monotonic 4, addrspace 3) + S_ENDPGM 0, implicit %3, implicit %4 + +... + +--- +name: test_atomic_cmpxchg_with_success_s64_global +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; CHECK-LABEL: name: test_atomic_cmpxchg_with_success_s64_global + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5 + ; CHECK: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_ATOMIC_CMPXCHG [[COPY]](p1), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic 8, addrspace 1) + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s64), [[COPY1]] + ; CHECK: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1) + %0:_(p1) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = COPY $vgpr4_vgpr5 + %3:_(s64), %4:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS %0, %1, %2 :: (load store syncscope("agent-one-as") monotonic monotonic 8, addrspace 1) + S_ENDPGM 0, implicit %3, implicit %4 + +... + +--- +name: test_atomic_cmpxchg_with_success_s64_local +body: | + bb.0: + liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + + ; CHECK-LABEL: name: test_atomic_cmpxchg_with_success_s64_local + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr1_vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr3_vgpr4 + ; CHECK: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic 8, addrspace 3) + ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s64), [[COPY1]] + ; CHECK: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1) + %0:_(p3) = COPY $vgpr0 + %1:_(s64) = COPY $vgpr1_vgpr2 + %2:_(s64) = COPY $vgpr3_vgpr4 + %3:_(s64), %4:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS %0, %1, %2 :: (load store syncscope("agent-one-as") monotonic monotonic 8, addrspace 3) + S_ENDPGM 0, implicit %3, implicit %4 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir index 2c3995a49d1c60..a7c62f74c216c4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir @@ -121,6 +121,36 @@ body: | $vgpr0_vgpr1 = COPY %1 ... 
+--- +name: test_bitcast_v2s64_to_v8s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_bitcast_v2s64_to_v8s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<8 x s16>) + %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<8 x s16>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 +... + +--- +name: test_bitcast_v8s16_to_v2s64 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_bitcast_v8s16_to_v2s64 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[COPY]](<8 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x s64>) + %0:_(<8 x s16>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s64>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 +... + --- name: test_bitcast_p0_to_p1 body: | @@ -180,3 +210,75 @@ body: | %1:_(p999) = G_BITCAST %0 $vgpr0_vgpr1 = COPY %1 ... + +--- +name: test_bitcast_v4s64_to_v8s32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: test_bitcast_v4s64_to_v8s32 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s32>) = G_BITCAST [[COPY]](<4 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<8 x s32>) + %0:_(<4 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:_(<8 x s32>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 +... + +--- +name: test_bitcast_v8s32_to_v4s64 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: test_bitcast_v8s32_to_v4s64 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s64>) = G_BITCAST [[COPY]](<8 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x s64>) + %0:_(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:_(<4 x s64>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 +... + +--- +name: test_bitcast_v8s64_to_v16s32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + + ; CHECK-LABEL: name: test_bitcast_v8s64_to_v16s32 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<16 x s32>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(<16 x s32>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %1 +... 
+ +--- +name: test_bitcast_v16s32_to_v8s64 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + + ; CHECK-LABEL: name: test_bitcast_v16s32_to_v8s64 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s64>) = G_BITCAST [[COPY]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<8 x s64>) + %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(<8 x s64>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir index c71010238750cf..bb1592ba5ef0db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir @@ -231,9 +231,9 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_0_v2i16_i32 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[DEF]](<2 x s16>), 0 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[DEF]](<2 x s16>) + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: $vgpr0 = COPY [[COPY]](s32) %0:_(<2 x s16>) = G_IMPLICIT_DEF %1:_(s32) = G_CONSTANT i32 0 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -417,9 +417,9 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx0_i32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<2 x s16>), 0 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 0 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -436,9 +436,11 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx1_i32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<2 x s16>), 16 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 1 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract.mir index 4e7ddafbbe308f..b3a14ce947d972 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract.mir @@ -56,9 
+56,8 @@ body: | ; CHECK-LABEL: name: test_extract_s16_s31_offset0 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[TRUNC]](s32), 0 - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s31) = G_TRUNC %0 %2:_(s16) = G_EXTRACT %1, 0 @@ -929,3 +928,181 @@ body: | %1:_(<2 x s16>) = G_EXTRACT %0, 0 $vgpr0 = COPY %1 ... + +--- +name: extract_s16_v2s16_offset0 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_v2s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 0 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: extract_s16_v2s16_offset1 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_v2s16_offset1 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 1 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: extract_s16_v2s16_offset8 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_v2s16_offset8 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 8 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: extract_s16_v2s16_offset16 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_v2s16_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 16 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: extract_s16_s32_offset0 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_s32_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 0 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... 
+ +--- +name: extract_s16_s32_offset1 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_s32_offset1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 1 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: extract_s16_s32_offset8 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_s32_offset8 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 8 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: extract_s16_s32_offset16 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_s32_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 16 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: extract_s16_p3_offset0 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_p3_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](p3), 0 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + %0:_(p3) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 0 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... + +--- +name: extract_s16_p3_offset1 +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_s16_p3_offset1 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](p3), 1 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + %0:_(p3) = COPY $vgpr0 + %1:_(s16) = G_EXTRACT %0, 1 + %2:_(s32) = G_ANYEXT %1 + $vgpr0 = COPY %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir index b36737de837d85..506dcfe0e8a446 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir @@ -33,6 +33,24 @@ body: | %2:_(s64) = G_INSERT %0, %1, 32 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_insert_s64_s32_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s32_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[COPY1]](s32), 16 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s64) = G_INSERT %0, %1, 16 + $vgpr0_vgpr1 = COPY %2 +... + --- name: test_insert_s96_s32_offset0 body: | @@ -305,6 +323,83 @@ body: | %2:_(s128) = G_INSERT %0, %1, 64 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 ... 
+ +--- +name: test_insert_s128_s16_offset0 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: test_insert_s128_s16_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 16 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 16 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: test_insert_s128_s16_offset32 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset32 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 32 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 32 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: test_insert_s128_s16_offset112 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset112 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 112 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 112 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
+ --- name: test_insert_v2s32_s32_offset0 body: | @@ -667,15 +762,48 @@ body: | ; CHECK-LABEL: name: test_insert_v2s16_s16_offset0 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; CHECK: [[INSERT:%[0-9]+]]:_(<2 x s16>) = G_INSERT [[COPY]], [[TRUNC]](s16), 0 - ; CHECK: $vgpr0 = COPY [[INSERT]](<2 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -65536 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[AND]] + ; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %1 %3:_(<2 x s16>) = G_INSERT %0, %2, 0 $vgpr0 = COPY %3 ... + +--- +name: test_insert_v2s16_s16_offset1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_insert_v2s16_s16_offset1 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -65535 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C2]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] + ; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %1 + %3:_(<2 x s16>) = G_INSERT %0, %2, 1 + $vgpr0 = COPY %3 +... --- name: test_insert_v2s16_s16_offset16 body: | @@ -685,9 +813,17 @@ body: | ; CHECK-LABEL: name: test_insert_v2s16_s16_offset16 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; CHECK: [[INSERT:%[0-9]+]]:_(<2 x s16>) = G_INSERT [[COPY]], [[TRUNC]](s16), 16 - ; CHECK: $vgpr0 = COPY [[INSERT]](<2 x s16>) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C2]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] + ; CHECK: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %1 @@ -1079,3 +1215,177 @@ body: | %2:_(<4 x s16>) = G_INSERT %0, %1, 32 $vgpr0_vgpr1 = COPY %2 ... 
+ +--- +name: test_insert_s64_s16_offset0 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 0 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 0 + $vgpr0_vgpr1 = COPY %3 +... +--- +name: test_insert_s64_s16_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 16 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 16 + $vgpr0_vgpr1 = COPY %3 +... +--- +name: test_insert_s64_s16_offset32 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 32 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 32 + $vgpr0_vgpr1 = COPY %3 +... +--- +name: test_insert_s64_s16_offset48 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset48 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 48 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 48 + $vgpr0_vgpr1 = COPY %3 +... +--- +name: test_insert_s32_s16_offset0 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_insert_s32_s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -65536 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[AND]] + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %1 + %3:_(s32) = G_INSERT %1, %2, 0 + $vgpr0 = COPY %3 +... 
+ +--- +name: test_insert_s32_s16_offset1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_insert_s32_s16_offset1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -65535 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %1 + %3:_(s32) = G_INSERT %1, %2, 1 + $vgpr0 = COPY %3 +... + +--- +name: test_insert_s32_s16_offset8 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_insert_s32_s16_offset8 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -65281 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %1 + %3:_(s32) = G_INSERT %1, %2, 8 + $vgpr0 = COPY %3 +... + +--- +name: test_insert_s32_s16_offset16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_insert_s32_s16_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]] + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[BITCAST]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %1 + %3:_(s32) = G_INSERT %1, %2, 16 + $vgpr0 = COPY %3 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir index 2795e75da98460..bcdbecd28a17a1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant-32bit.mir @@ -39,8 +39,12 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV1]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C7]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: $vgpr0 = COPY [[OR2]](s32) %0:_(p6) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, align 1, addrspace 6) $vgpr0 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir index d1c62196c7d6ba..5bf60e2ce82c92 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir @@ -383,53 +383,78 @@ body: | ; CI-LABEL: name: test_load_constant_s32_align2 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: $vgpr0 = COPY [[OR]](s32) ; VI-LABEL: name: test_load_constant_s32_align2 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: $vgpr0 = COPY [[OR]](s32) ; GFX9-LABEL: 
name: test_load_constant_s32_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: $vgpr0 = COPY [[OR]](s32) ; CI-MESA-LABEL: name: test_load_constant_s32_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](s32) + ; CI-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: $vgpr0 = COPY [[OR]](s32) ; GFX9-MESA-LABEL: name: test_load_constant_s32_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](s32) + ; GFX9-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: $vgpr0 = COPY [[OR]](s32) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, align 2, addrspace 4) $vgpr0 = COPY %1 @@ -471,8 +496,12 @@ 
body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: $vgpr0 = COPY [[OR2]](s32) ; VI-LABEL: name: test_load_constant_s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -499,8 +528,12 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: $vgpr0 = COPY [[OR2]](s32) ; GFX9-LABEL: name: test_load_constant_s32_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -527,8 +560,12 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: $vgpr0 = COPY [[OR2]](s32) ; CI-MESA-LABEL: name: test_load_constant_s32_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -559,8 +596,12 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](s32) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: $vgpr0 = COPY [[OR2]](s32) ; GFX9-MESA-LABEL: name: test_load_constant_s32_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -587,8 +628,12 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = 
G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](s32) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: $vgpr0 = COPY [[OR2]](s32) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, align 1, addrspace 4) $vgpr0 = COPY %1 @@ -712,92 +757,142 @@ body: | ; CI-LABEL: name: test_load_constant_s64_align2 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_constant_s64_align2 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES 
[[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-LABEL: name: test_load_constant_s64_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-MESA-LABEL: name: test_load_constant_s64_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT 
i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-MESA-LABEL: name: test_load_constant_s64_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) 
= G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(s64) = G_LOAD %0 :: (load 8, align 2, addrspace 4) @@ -868,7 +963,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_constant_s64_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -920,7 +1024,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-LABEL: name: test_load_constant_s64_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -972,7 +1085,16 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-MESA-LABEL: name: test_load_constant_s64_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -1032,7 +1154,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = 
G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-MESA-LABEL: name: test_load_constant_s64_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -1084,7 +1215,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(s64) = G_LOAD %0 :: (load 8, align 1, addrspace 4) @@ -1193,132 +1333,202 @@ body: | ; CI-LABEL: name: test_load_constant_s96_align2 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD5]](s32) - ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_constant_s96_align2 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; VI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; VI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = 
G_AND [[COPY3]], [[C5]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-LABEL: name: test_load_constant_s96_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: 
[[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-MESA-LABEL: name: test_load_constant_s96_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CI-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CI-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; CI-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-MESA-LABEL: name: test_load_constant_s96_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: 
[[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(s96) = G_LOAD %0 :: (load 12, align 2, addrspace 4) @@ -1417,7 +1627,20 @@ body: | ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], 
[[SHL6]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_constant_s96_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -1493,7 +1716,20 @@ body: | ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; VI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-LABEL: name: test_load_constant_s96_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -1569,7 +1805,20 @@ body: | ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; GFX9: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; GFX9: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-MESA-LABEL: name: test_load_constant_s96_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -1657,7 +1906,20 @@ body: | ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI-MESA: 
[[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-MESA-LABEL: name: test_load_constant_s96_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -1733,7 +1995,20 @@ body: | ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 4) @@ -2007,7 +2282,24 @@ body: | ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C16]](s32) ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C18]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C18]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], 
[[SHL9]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C18]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C18]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_constant_s128_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -2107,7 +2399,24 @@ body: | ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]] ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16) ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; VI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; GFX9-LABEL: name: test_load_constant_s128_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -2207,7 +2516,24 @@ body: | ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]] ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16) ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; GFX9: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: 
[[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; CI-MESA-LABEL: name: test_load_constant_s128_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -2323,7 +2649,24 @@ body: | ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C16]](s32) ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C18]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C18]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C18]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C18]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; GFX9-MESA-LABEL: name: test_load_constant_s128_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -2423,7 +2766,24 @@ body: | ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]] ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16) ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: 
[[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 4) @@ -2587,7 +2947,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; VI-LABEL: name: test_load_constant_p1_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -2639,7 +3008,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; GFX9-LABEL: name: test_load_constant_p1_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -2691,7 +3069,16 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-MESA-LABEL: name: test_load_constant_p1_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -2751,7 +3138,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = 
G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p1) ; GFX9-MESA-LABEL: name: test_load_constant_p1_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -2803,7 +3199,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(p1) = G_LOAD %0 :: (load 8, align 1, addrspace 4) @@ -2912,92 +3317,142 @@ body: | ; CI-LABEL: name: test_load_constant_p4_align2 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; VI-LABEL: name: test_load_constant_p4_align2 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-LABEL: name: test_load_constant_p4_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p4) ; CI-MESA-LABEL: name: test_load_constant_p4_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-MESA-LABEL: name: test_load_constant_p4_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], 
[[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(p4) = G_LOAD %0 :: (load 8, align 2, addrspace 4) @@ -3068,7 +3523,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; VI-LABEL: name: test_load_constant_p4_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -3120,7 +3584,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = 
G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-LABEL: name: test_load_constant_p4_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -3172,7 +3645,16 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p4) ; CI-MESA-LABEL: name: test_load_constant_p4_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -3232,7 +3714,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-MESA-LABEL: name: test_load_constant_p4_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -3284,7 +3775,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), 
[[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(p4) = G_LOAD %0 :: (load 8, align 1, addrspace 4) @@ -3331,53 +3831,83 @@ body: | ; CI-LABEL: name: test_load_constant_p5_align2 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p5) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_constant_p5_align2 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-LABEL: name: test_load_constant_p5_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p5) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX9: $vgpr0 = 
COPY [[INTTOPTR]](p5) ; CI-MESA-LABEL: name: test_load_constant_p5_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](p5) + ; CI-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; CI-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-MESA-LABEL: name: test_load_constant_p5_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](p5) + ; GFX9-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX9-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(p5) = G_LOAD %0 :: (load 4, align 2, addrspace 4) $vgpr0 = COPY %1 @@ -3419,8 +3949,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p5) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_constant_p5_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -3447,8 +3982,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; 
VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-LABEL: name: test_load_constant_p5_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -3475,8 +4015,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p5) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-MESA-LABEL: name: test_load_constant_p5_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -3507,8 +4052,13 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](p5) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; CI-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-MESA-LABEL: name: test_load_constant_p5_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 1, addrspace 4) @@ -3535,8 +4085,13 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](p5) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX9-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(p5) = G_LOAD %0 :: (load 4, align 1, addrspace 4) $vgpr0 = COPY %1 @@ -6390,166 +6945,256 @@ body: | ; CI-LABEL: name: 
test_load_constant_v2s64_align2 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; CI: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C1]](s64) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C2]](s64) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 4) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_constant_v2s64_align2 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C1]](s64) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C2]](s64) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 4) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] 
+ ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: test_load_constant_v2s64_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C1]](s64) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C2]](s64) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = 
G_LOAD [[GEP6]](p4) :: (load 2, addrspace 4) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-MESA-LABEL: name: test_load_constant_v2s64_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD 
[[GEP3]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; CI-MESA: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C1]](s64) ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C2]](s64) ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 4) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-MESA-LABEL: name: test_load_constant_v2s64_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) - ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY 
[[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 4) ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C1]](s64) ; GFX9-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C2]](s64) ; GFX9-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 4) - ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p4) = COPY $vgpr0_vgpr1 @@ -6621,9 +7266,18 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = 
G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; CI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -6644,34 +7298,42 @@ body: | ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_constant_v2s64_align1 @@ -6724,9 +7386,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; VI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -6746,27 +7417,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: 
[[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: test_load_constant_v2s64_align1 @@ -6819,9 +7498,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; GFX9: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -6841,27 +7529,35 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; 
GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-MESA-LABEL: name: test_load_constant_v2s64_align1 @@ -6922,9 +7618,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -6945,34 +7650,42 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: 
[[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-MESA-LABEL: name: test_load_constant_v2s64_align1 @@ -7025,9 +7738,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT 
[[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -7047,27 +7769,35 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) 
+ ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p4) = COPY $vgpr0_vgpr1 @@ -7225,9 +7955,18 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; CI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -7248,36 +7987,44 @@ body: | ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + 
; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; CI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) ; CI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; CI: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; CI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -7298,34 +8045,42 @@ body: | ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI: [[SHL14:%[0-9]+]]:_(s32) = 
G_SHL [[AND21]], [[COPY20]](s32) + ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; CI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; CI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7380,9 +8135,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; VI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -7402,29 +8166,37 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: 
[[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; VI: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; VI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; VI: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; VI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; VI: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; VI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -7444,27 +8216,35 @@ body: | ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; VI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; VI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; VI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7519,9 +8299,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; GFX9: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -7541,29 +8330,37 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: 
[[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX9: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; GFX9: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; GFX9: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; GFX9: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -7583,27 +8380,35 @@ body: | ; GFX9: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9: [[OR13:%[0-9]+]]:_(s16) = G_OR 
[[AND18]], [[SHL13]] ; GFX9: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; GFX9: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7666,9 +8471,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -7689,36 +8503,44 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: 
[[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI-MESA: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; CI-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI-MESA: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) ; CI-MESA: 
[[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; CI-MESA: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; CI-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -7739,34 +8561,42 @@ body: | ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-MESA: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI-MESA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-MESA: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI-MESA: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-MESA: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI-MESA: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-MESA: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + 
; CI-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7821,9 +8651,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -7843,29 +8682,37 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX9-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; GFX9-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; GFX9-MESA: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; GFX9-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -7885,27 +8732,35 @@ body: | ; GFX9-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9-MESA: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9-MESA: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9-MESA: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9-MESA: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - 
; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -8043,9 +8898,18 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; CI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -8066,36 +8930,44 @@ body: | ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: 
[[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; CI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) ; CI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; CI: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; CI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -8116,36 +8988,44 @@ body: | ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = 
G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; CI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; CI: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) + ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; CI: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; CI: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C13]](s64) ; CI: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p4) :: (load 1, addrspace 4) ; CI: [[GEP24:%[0-9]+]]:_(p4) = G_GEP [[GEP23]], [[C]](s64) ; CI: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p4) :: (load 1, addrspace 4) @@ -8166,34 +9046,42 @@ body: | ; CI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LOAD25]](s32) ; CI: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C9]] - ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) - ; CI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) - ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] + ; CI: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) + ; CI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL18]](s32) + ; CI: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] ; CI: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; CI: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; CI: [[COPY26:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LOAD27]](s32) ; CI: [[AND27:%[0-9]+]]:_(s32) = G_AND 
[[COPY27]], [[C9]] - ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) - ; CI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) - ; CI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] + ; CI: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) + ; CI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL19]](s32) + ; CI: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] ; CI: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; CI: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; CI: [[COPY28:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LOAD29]](s32) ; CI: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C9]] - ; CI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) - ; CI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) - ; CI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] + ; CI: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) + ; CI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL20]](s32) + ; CI: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] ; CI: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; CI: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; CI: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LOAD31]](s32) ; CI: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C9]] - ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) - ; CI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) - ; CI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] - ; CI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; CI: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) + ; CI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL21]](s32) + ; CI: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] + ; CI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; CI: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; CI: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C10]](s32) + ; CI: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; CI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; CI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; CI: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C10]](s32) + ; CI: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; CI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; VI-LABEL: name: test_load_constant_v4s64_align1 @@ -8246,9 +9134,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR 
[[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; VI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -8268,29 +9165,37 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; VI: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; VI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; VI: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; VI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; VI: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; VI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -8310,29 +9215,37 @@ body: | ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: 
[[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; VI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; VI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; VI: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) + ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; VI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; VI: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) ; VI: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p4) :: (load 1, addrspace 4) ; VI: [[GEP24:%[0-9]+]]:_(p4) = G_GEP [[GEP23]], [[C]](s64) ; VI: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p4) :: (load 1, addrspace 4) @@ -8352,27 +9265,35 @@ body: | ; VI: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC24]], [[C7]] ; VI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD25]](s32) ; VI: [[AND25:%[0-9]+]]:_(s16) = G_AND [[TRUNC25]], [[C7]] - ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) - ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL12]] + ; VI: [[SHL18:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) + ; VI: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL18]] ; VI: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; VI: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; VI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD27]](s32) ; VI: [[AND27:%[0-9]+]]:_(s16) = G_AND [[TRUNC27]], [[C7]] - ; VI: 
[[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) - ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL13]] + ; VI: [[SHL19:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) + ; VI: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL19]] ; VI: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; VI: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; VI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD29]](s32) ; VI: [[AND29:%[0-9]+]]:_(s16) = G_AND [[TRUNC29]], [[C7]] - ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) - ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL14]] + ; VI: [[SHL20:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) + ; VI: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL20]] ; VI: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; VI: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; VI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD31]](s32) ; VI: [[AND31:%[0-9]+]]:_(s16) = G_AND [[TRUNC31]], [[C7]] - ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) - ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL15]] - ; VI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; VI: [[SHL21:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) + ; VI: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL21]] + ; VI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; VI: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; VI: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C9]](s32) + ; VI: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; VI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; VI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; VI: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C9]](s32) + ; VI: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; VI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; GFX9-LABEL: name: test_load_constant_v4s64_align1 @@ -8425,9 +9346,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; GFX9: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -8447,29 +9377,37 @@ body: | ; GFX9: 
[[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX9: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; GFX9: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; GFX9: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; GFX9: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -8489,29 +9427,37 @@ body: | ; GFX9: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD18]](s32) ; GFX9: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; GFX9: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; GFX9: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) + ; GFX9: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; GFX9: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; GFX9: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) ; GFX9: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p4) :: (load 1, addrspace 4) ; GFX9: [[GEP24:%[0-9]+]]:_(p4) = G_GEP [[GEP23]], [[C]](s64) ; GFX9: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p4) :: (load 1, addrspace 4) @@ -8531,27 +9477,35 @@ body: | ; GFX9: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC24]], [[C7]] ; GFX9: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD25]](s32) ; GFX9: [[AND25:%[0-9]+]]:_(s16) = G_AND [[TRUNC25]], [[C7]] - ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) - ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL12]] + ; GFX9: [[SHL18:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) + ; GFX9: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL18]] ; GFX9: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; GFX9: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; GFX9: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD27]](s32) ; GFX9: [[AND27:%[0-9]+]]:_(s16) = G_AND [[TRUNC27]], [[C7]] - ; GFX9: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) - ; GFX9: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL13]] + ; GFX9: [[SHL19:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) + ; GFX9: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], 
[[SHL19]] ; GFX9: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; GFX9: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; GFX9: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD29]](s32) ; GFX9: [[AND29:%[0-9]+]]:_(s16) = G_AND [[TRUNC29]], [[C7]] - ; GFX9: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) - ; GFX9: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL14]] + ; GFX9: [[SHL20:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) + ; GFX9: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL20]] ; GFX9: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; GFX9: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; GFX9: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD31]](s32) ; GFX9: [[AND31:%[0-9]+]]:_(s16) = G_AND [[TRUNC31]], [[C7]] - ; GFX9: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) - ; GFX9: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL15]] - ; GFX9: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; GFX9: [[SHL21:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) + ; GFX9: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL21]] + ; GFX9: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; GFX9: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; GFX9: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C9]](s32) + ; GFX9: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; GFX9: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; GFX9: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; GFX9: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C9]](s32) + ; GFX9: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; GFX9: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; CI-MESA-LABEL: name: test_load_constant_v4s64_align1 @@ -8612,9 +9566,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -8635,36 +9598,44 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: 
[[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI-MESA: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; CI-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI-MESA: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) ; CI-MESA: 
[[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; CI-MESA: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; CI-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -8685,36 +9656,44 @@ body: | ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-MESA: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI-MESA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-MESA: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI-MESA: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-MESA: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI-MESA: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-MESA: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; CI-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; CI-MESA: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) + ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; 
CI-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; CI-MESA: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; CI-MESA: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C13]](s64) ; CI-MESA: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p4) :: (load 1, addrspace 4) ; CI-MESA: [[GEP24:%[0-9]+]]:_(p4) = G_GEP [[GEP23]], [[C]](s64) ; CI-MESA: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p4) :: (load 1, addrspace 4) @@ -8735,34 +9714,42 @@ body: | ; CI-MESA: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LOAD25]](s32) ; CI-MESA: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C9]] - ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) - ; CI-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) - ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] + ; CI-MESA: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) + ; CI-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL18]](s32) + ; CI-MESA: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] ; CI-MESA: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; CI-MESA: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; CI-MESA: [[COPY26:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LOAD27]](s32) ; CI-MESA: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C9]] - ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) - ; CI-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) - ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] + ; CI-MESA: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) + ; CI-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL19]](s32) + ; CI-MESA: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] ; CI-MESA: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; CI-MESA: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; CI-MESA: [[COPY28:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LOAD29]](s32) ; CI-MESA: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C9]] - ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) - ; CI-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) - ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] + ; CI-MESA: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) + ; CI-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL20]](s32) + ; CI-MESA: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] ; CI-MESA: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; CI-MESA: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; CI-MESA: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LOAD31]](s32) ; CI-MESA: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C9]] - ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) - ; CI-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) - ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] - ; CI-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; CI-MESA: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) + ; CI-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL21]](s32) + ; CI-MESA: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] + ; CI-MESA: 
[[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; CI-MESA: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; CI-MESA: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C10]](s32) + ; CI-MESA: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; CI-MESA: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; CI-MESA: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; CI-MESA: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C10]](s32) + ; CI-MESA: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; CI-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; GFX9-MESA-LABEL: name: test_load_constant_v4s64_align1 @@ -8815,9 +9802,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -8837,29 +9833,37 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) 
; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX9-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; GFX9-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p4) :: (load 1, addrspace 4) ; GFX9-MESA: [[GEP16:%[0-9]+]]:_(p4) = G_GEP [[GEP15]], [[C]](s64) ; GFX9-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p4) :: (load 1, addrspace 4) @@ -8879,29 +9883,37 @@ body: | ; GFX9-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9-MESA: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9-MESA: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9-MESA: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9-MESA: 
[[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9-MESA: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; GFX9-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; GFX9-MESA: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) + ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; GFX9-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; GFX9-MESA: [[GEP23:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C12]](s64) ; GFX9-MESA: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p4) :: (load 1, addrspace 4) ; GFX9-MESA: [[GEP24:%[0-9]+]]:_(p4) = G_GEP [[GEP23]], [[C]](s64) ; GFX9-MESA: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p4) :: (load 1, addrspace 4) @@ -8921,27 +9933,35 @@ body: | ; GFX9-MESA: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC24]], [[C7]] ; GFX9-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD25]](s32) ; GFX9-MESA: [[AND25:%[0-9]+]]:_(s16) = G_AND [[TRUNC25]], [[C7]] - ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) - ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL12]] + ; GFX9-MESA: [[SHL18:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) + ; GFX9-MESA: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL18]] ; GFX9-MESA: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; GFX9-MESA: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; GFX9-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD27]](s32) ; GFX9-MESA: [[AND27:%[0-9]+]]:_(s16) = G_AND [[TRUNC27]], [[C7]] - ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) - ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL13]] + ; GFX9-MESA: [[SHL19:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) + ; GFX9-MESA: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL19]] ; GFX9-MESA: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; GFX9-MESA: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; GFX9-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD29]](s32) ; GFX9-MESA: [[AND29:%[0-9]+]]:_(s16) = G_AND [[TRUNC29]], [[C7]] - ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) - ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL14]] + ; GFX9-MESA: [[SHL20:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) + ; GFX9-MESA: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL20]] ; GFX9-MESA: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; GFX9-MESA: 
[[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; GFX9-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD31]](s32) ; GFX9-MESA: [[AND31:%[0-9]+]]:_(s16) = G_AND [[TRUNC31]], [[C7]] - ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) - ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL15]] - ; GFX9-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; GFX9-MESA: [[SHL21:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) + ; GFX9-MESA: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL21]] + ; GFX9-MESA: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; GFX9-MESA: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; GFX9-MESA: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C9]](s32) + ; GFX9-MESA: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; GFX9-MESA: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; GFX9-MESA: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; GFX9-MESA: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C9]](s32) + ; GFX9-MESA: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; GFX9-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) %0:_(p4) = COPY $vgpr0_vgpr1 @@ -9137,9 +10157,18 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; CI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -9160,34 +10189,42 @@ body: | ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; 
CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; VI-LABEL: name: test_load_constant_v2p1_align1 @@ -9240,9 +10277,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p1) = 
G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; VI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -9262,27 +10308,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; GFX9-LABEL: name: test_load_constant_v2p1_align1 @@ -9335,9 +10389,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = 
G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; GFX9: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -9357,27 +10420,35 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = 
G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_constant_v2p1_align1 @@ -9438,9 +10509,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -9461,34 +10541,42 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; 
CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_constant_v2p1_align1 @@ -9541,9 +10629,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 1, addrspace 4) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 1, addrspace 4) @@ -9563,27 +10660,35 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = 
G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) %0:_(p4) = COPY $vgpr0_vgpr1 @@ -9689,9 +10794,14 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; CI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 4) ; CI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP 
[[GEP3]], [[C]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 4) @@ -9704,19 +10814,23 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; VI-LABEL: name: test_load_constant_v2p3_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -9744,9 +10858,14 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 4) ; VI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 4) @@ -9758,16 +10877,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: 
[[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; GFX9-LABEL: name: test_load_constant_v2p3_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -9795,9 +10918,14 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX9: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 4) ; GFX9: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 4) @@ -9809,16 +10937,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: 
[[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; CI-MESA-LABEL: name: test_load_constant_v2p3_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -9850,9 +10982,14 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 4) ; CI-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 4) @@ -9865,19 +11002,23 @@ body: | ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; CI-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; GFX9-MESA-LABEL: name: test_load_constant_v2p3_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -9905,9 +11046,14 @@ body: | ; GFX9-MESA: 
[[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 4) ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 4) @@ -9919,16 +11065,20 @@ body: | ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load 8, align 1, addrspace 4) @@ -10220,9 +11370,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], 
[[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 1) ; CI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 1) @@ -10235,19 +11389,22 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_extload_constant_v2s32_from_4_align1 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -10275,9 +11432,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 1) ; VI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 1) @@ -10289,16 +11450,19 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; 
VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_extload_constant_v2s32_from_4_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -10326,9 +11490,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 1) ; GFX9: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 1) @@ -10340,16 +11508,19 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-MESA-LABEL: name: test_extload_constant_v2s32_from_4_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -10381,9 +11552,13 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 1) ; CI-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 1) @@ -10396,19 +11571,22 @@ body: | ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-MESA-LABEL: name: test_extload_constant_v2s32_from_4_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 @@ -10436,9 +11614,13 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; 
GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C6]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[GEP3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 1, addrspace 1) @@ -10450,16 +11632,19 @@ body: | ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 1, addrspace 1) @@ -10475,97 +11660,137 @@ body: | ; CI-LABEL: name: test_extload_constant_v2s32_from_4_align2 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] 
+ ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[GEP1]], [[C]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_extload_constant_v2s32_from_4_align2 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[GEP1]], [[C]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_extload_constant_v2s32_from_4_align2 ; GFX9: 
[[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[GEP1]], [[C]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-MESA-LABEL: name: test_extload_constant_v2s32_from_4_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) + ; CI-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; CI-MESA: 
[[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[GEP1]], [[C]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-MESA-LABEL: name: test_extload_constant_v2s32_from_4_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) + ; GFX9-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[GEP1]], [[C]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p4) = COPY $vgpr0_vgpr1 
%1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 2, addrspace 1) @@ -10842,9 +12067,22 @@ body: | ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; CI: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; CI: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C14]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; CI: [[C15:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C15]](s64) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p4) :: (load 1, addrspace 1) ; CI: [[GEP12:%[0-9]+]]:_(p4) = G_GEP [[GEP11]], [[C]](s64) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p4) :: (load 1, addrspace 1) @@ -10873,50 +12111,62 @@ body: | ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C13]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C13]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] ; CI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; CI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C13]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s16) = 
G_OR [[AND16]], [[TRUNC17]] ; CI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; CI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C13]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; CI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C13]] - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; CI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C13]] - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C14]](s32) + ; CI: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; CI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; CI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C14]](s32) + ; CI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C14]](s32) + ; CI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; CI: [[COPY24:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; CI: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY24]](s96) @@ -10995,9 +12245,22 @@ body: | ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; VI: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; VI: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], 
[[C13]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; VI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; VI: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; VI: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C14]](s64) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p4) :: (load 1, addrspace 1) ; VI: [[GEP12:%[0-9]+]]:_(p4) = G_GEP [[GEP11]], [[C]](s64) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p4) :: (load 1, addrspace 1) @@ -11025,39 +12288,51 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C11]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C11]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C11]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] ; VI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C11]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) + ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL11]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C11]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL12]] ; VI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C11]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = 
G_OR [[AND20]], [[SHL13]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C11]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) + ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL14]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C13]](s32) + ; VI: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C13]](s32) + ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C13]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -11136,9 +12411,22 @@ body: | ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C13]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; GFX9: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; GFX9: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; GFX9: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C14]](s64) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p4) :: (load 1, addrspace 1) ; GFX9: [[GEP12:%[0-9]+]]:_(p4) = G_GEP [[GEP11]], [[C]](s64) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p4) :: (load 1, addrspace 1) @@ -11166,39 +12454,51 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C11]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C11]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], 
[[C12]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C11]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] ; GFX9: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; GFX9: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; GFX9: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C11]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) + ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL11]] ; GFX9: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; GFX9: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C11]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) + ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL12]] ; GFX9: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; GFX9: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C11]] - ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) - ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) + ; GFX9: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL13]] ; GFX9: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; GFX9: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C11]] - ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) - ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; GFX9: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) + ; GFX9: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL14]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C13]](s32) + ; GFX9: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; GFX9: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; GFX9: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C13]](s32) + ; GFX9: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C13]](s32) + ; GFX9: [[OR17:%[0-9]+]]:_(s32) 
= G_OR [[ZEXT10]], [[SHL17]] + ; GFX9: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -11289,9 +12589,22 @@ body: | ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; CI-MESA: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; CI-MESA: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C14]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; CI-MESA: [[C15:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI-MESA: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C15]](s64) ; CI-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p4) :: (load 1, addrspace 1) ; CI-MESA: [[GEP12:%[0-9]+]]:_(p4) = G_GEP [[GEP11]], [[C]](s64) ; CI-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p4) :: (load 1, addrspace 1) @@ -11320,50 +12633,62 @@ body: | ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C13]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C13]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] ; CI-MESA: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; CI-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: 
[[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-MESA: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C13]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; CI-MESA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-MESA: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C13]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; CI-MESA: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-MESA: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C13]] - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; CI-MESA: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-MESA: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C13]] - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C14]](s32) + ; CI-MESA: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; CI-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; CI-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C14]](s32) + ; CI-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C14]](s32) + ; CI-MESA: 
[[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; CI-MESA: [[COPY24:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; CI-MESA: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY24]](s96) @@ -11442,9 +12767,22 @@ body: | ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; GFX9-MESA: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9-MESA: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C13]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; GFX9-MESA: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9-MESA: [[GEP11:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C14]](s64) ; GFX9-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p4) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP12:%[0-9]+]]:_(p4) = G_GEP [[GEP11]], [[C]](s64) ; GFX9-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p4) :: (load 1, addrspace 1) @@ -11472,39 +12810,51 @@ body: | ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C11]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C11]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C11]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] ; GFX9-MESA: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; GFX9-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; GFX9-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9-MESA: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C11]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], 
[[SHL8]] + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL11]] ; GFX9-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; GFX9-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9-MESA: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C11]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) + ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL12]] ; GFX9-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; GFX9-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9-MESA: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C11]] - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) + ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL13]] ; GFX9-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; GFX9-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9-MESA: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C11]] - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) + ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL14]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C13]](s32) + ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; GFX9-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; GFX9-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C13]](s32) + ; GFX9-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C13]](s32) + ; GFX9-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -11526,248 +12876,378 @@ body: | ; CI-LABEL: name: test_extload_constant_v2s96_from_24_align2 ; CI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: 
[[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; CI: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; CI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C]](s64) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C1]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; CI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C2]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; CI: [[GEP9:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C3]](s64) ; CI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p4) :: (load 2, addrspace 1) - ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[GEP10:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C4]](s64) ; CI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p4) :: (load 
2, addrspace 1) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; CI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; CI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; CI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; CI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; CI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; VI-LABEL: name: test_extload_constant_v2s96_from_24_align2 ; VI: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; VI: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; VI: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; VI: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: 
[[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; VI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; VI: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C]](s64) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C1]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; VI: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C2]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[GEP9:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C3]](s64) ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[GEP10:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C4]](s64) ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p4) :: (load 2, addrspace 1) - ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; VI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; VI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], 
[[C5]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; VI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; VI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; VI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; GFX9-LABEL: name: test_extload_constant_v2s96_from_24_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), 
[[OR1]](s32), [[OR2]](s32) + ; GFX9: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C]](s64) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C1]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; GFX9: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C2]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[GEP9:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C3]](s64) ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[GEP10:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C4]](s64) ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p4) :: (load 2, addrspace 1) - ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; GFX9: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; CI-MESA-LABEL: name: test_extload_constant_v2s96_from_24_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CI-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CI-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; CI-MESA: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; CI-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI-MESA: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C]](s64) ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; CI-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C1]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C2]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD9]](s32) ; CI-MESA: [[GEP9:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C3]](s64) ; CI-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[GEP10:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C4]](s64) ; CI-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p4) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; CI-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; CI-MESA: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; CI-MESA: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; CI-MESA: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; CI-MESA: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; CI-MESA: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; GFX9-MESA-LABEL: name: test_extload_constant_v2s96_from_24_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C3]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p4) :: 
(load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C4]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9-MESA: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9-MESA: [[GEP5:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C7]](s64) ; GFX9-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[GEP6:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C]](s64) ; GFX9-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C1]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C2]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[GEP9:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C3]](s64) ; GFX9-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[GEP10:%[0-9]+]]:_(p4) = G_GEP [[GEP5]], [[C4]](s64) ; GFX9-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p4) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; GFX9-MESA: 
[[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; GFX9-MESA: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX9-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; GFX9-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; GFX9-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; GFX9-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; GFX9-MESA: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; GFX9-MESA: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 2, addrspace 1) %2:_(s96) = G_EXTRACT %1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index bb8bf7c84d7513..5fb5ad65673a10 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -383,53 +383,78 @@ body: | ; CI-LABEL: name: test_load_flat_s32_align2 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: $vgpr0 = COPY [[OR]](s32) ; VI-LABEL: name: test_load_flat_s32_align2 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) 
= G_LOAD [[GEP]](p0) :: (load 2) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: $vgpr0 = COPY [[OR]](s32) ; GFX9-LABEL: name: test_load_flat_s32_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: $vgpr0 = COPY [[OR]](s32) ; CI-MESA-LABEL: name: test_load_flat_s32_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](s32) + ; CI-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: $vgpr0 = COPY [[OR]](s32) ; GFX9-MESA-LABEL: name: test_load_flat_s32_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](s32) + ; GFX9-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 
+ ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: $vgpr0 = COPY [[OR]](s32) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, align 2, addrspace 0) $vgpr0 = COPY %1 @@ -471,8 +496,12 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: $vgpr0 = COPY [[OR2]](s32) ; VI-LABEL: name: test_load_flat_s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -499,8 +528,12 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: $vgpr0 = COPY [[OR2]](s32) ; GFX9-LABEL: name: test_load_flat_s32_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -527,8 +560,12 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: $vgpr0 = COPY [[OR2]](s32) ; CI-MESA-LABEL: name: test_load_flat_s32_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -559,8 +596,12 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](s32) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: $vgpr0 = COPY [[OR2]](s32) ; GFX9-MESA-LABEL: name: test_load_flat_s32_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -587,8 +628,12 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](s32) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: $vgpr0 = COPY [[OR2]](s32) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, align 1, addrspace 0) $vgpr0 = COPY %1 @@ -712,92 +757,142 @@ body: | ; CI-LABEL: name: test_load_flat_s64_align2 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_flat_s64_align2 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) 
; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-LABEL: name: test_load_flat_s64_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-MESA-LABEL: name: test_load_flat_s64_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-MESA-LABEL: name: test_load_flat_s64_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY 
[[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s64) = G_LOAD %0 :: (load 8, align 2, addrspace 0) @@ -868,7 +963,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_flat_s64_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -920,7 +1024,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-LABEL: name: test_load_flat_s64_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -972,7 +1085,16 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) ; 
CI-MESA-LABEL: name: test_load_flat_s64_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -1032,7 +1154,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-MESA-LABEL: name: test_load_flat_s64_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -1084,7 +1215,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s64) = G_LOAD %0 :: (load 8, align 1, addrspace 0) @@ -1193,132 +1333,202 @@ body: | ; CI-LABEL: name: test_load_flat_s96_align2 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CI: 
[[GEP4:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C4]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_flat_s96_align2 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; VI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; VI: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C4]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: 
[[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-LABEL: name: test_load_flat_s96_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C4]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: 
[[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-MESA-LABEL: name: test_load_flat_s96_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CI-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CI-MESA: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C4]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; CI-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-MESA-LABEL: name: test_load_flat_s96_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: 
(load 2) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C4]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s96) = G_LOAD %0 :: (load 12, align 2, addrspace 0) @@ -1417,7 +1627,20 @@ body: | ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], 
[[C14]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_flat_s96_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -1493,7 +1716,20 @@ body: | ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; VI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-LABEL: name: test_load_flat_s96_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -1569,7 +1805,20 @@ body: | ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; GFX9: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; GFX9: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-MESA-LABEL: name: test_load_flat_s96_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -1657,7 +1906,20 @@ body: | ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + 
; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-MESA-LABEL: name: test_load_flat_s96_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -1733,7 +1995,20 @@ body: | ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 0) @@ -2057,7 +2332,24 @@ body: | ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C16]](s32) ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C18]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C18]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C18]](s32) + ; CI: 
[[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C18]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_flat_s128_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -2157,7 +2449,24 @@ body: | ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]] ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16) ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; VI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; GFX9-LABEL: name: test_load_flat_s128_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -2257,7 +2566,24 @@ body: | ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]] ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16) ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; GFX9: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: 
[[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; CI-MESA-LABEL: name: test_load_flat_s128_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -2373,7 +2699,24 @@ body: | ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C16]](s32) ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C18]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C18]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C18]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C18]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; GFX9-MESA-LABEL: name: test_load_flat_s128_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -2473,7 +2816,24 @@ body: | ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]] ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16) ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), 
[[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 0) @@ -2657,7 +3017,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; VI-LABEL: name: test_load_flat_p1_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -2709,7 +3078,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; GFX9-LABEL: name: test_load_flat_p1_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -2761,7 +3139,16 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-MESA-LABEL: name: test_load_flat_p1_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -2821,7 +3208,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES 
[[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p1) ; GFX9-MESA-LABEL: name: test_load_flat_p1_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -2873,7 +3269,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(p1) = G_LOAD %0 :: (load 8, align 1, addrspace 0) @@ -2982,92 +3387,142 @@ body: | ; CI-LABEL: name: test_load_flat_p4_align2 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: 
[[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; VI-LABEL: name: test_load_flat_p4_align2 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-LABEL: name: test_load_flat_p4_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND 
[[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p4) ; CI-MESA-LABEL: name: test_load_flat_p4_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-MESA-LABEL: name: test_load_flat_p4_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: 
[[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(p4) = G_LOAD %0 :: (load 8, align 2, addrspace 0) @@ -3138,7 +3593,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; VI-LABEL: name: test_load_flat_p4_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -3190,7 +3654,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-LABEL: name: test_load_flat_p4_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -3242,7 +3715,16 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: 
[[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p4) ; CI-MESA-LABEL: name: test_load_flat_p4_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -3302,7 +3784,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-MESA-LABEL: name: test_load_flat_p4_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -3354,7 +3845,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(p4) = G_LOAD %0 :: (load 8, align 1, addrspace 0) @@ -3401,53 +3901,83 @@ body: | ; CI-LABEL: name: test_load_flat_p5_align2 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = 
G_LOAD [[GEP]](p0) :: (load 2) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p5) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_flat_p5_align2 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-LABEL: name: test_load_flat_p5_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p5) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-MESA-LABEL: name: test_load_flat_p5_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-MESA: $vgpr0 = COPY 
[[MV]](p5) + ; CI-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; CI-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-MESA-LABEL: name: test_load_flat_p5_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](p5) + ; GFX9-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX9-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(p5) = G_LOAD %0 :: (load 4, align 2, addrspace 0) $vgpr0 = COPY %1 @@ -3489,8 +4019,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p5) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_flat_p5_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -3517,8 +4052,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-LABEL: name: test_load_flat_p5_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY 
$vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -3545,8 +4085,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p5) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-MESA-LABEL: name: test_load_flat_p5_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -3577,8 +4122,13 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](p5) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; CI-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-MESA-LABEL: name: test_load_flat_p5_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1) @@ -3605,8 +4155,13 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](p5) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX9-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(p5) = G_LOAD %0 :: (load 4, align 1, addrspace 0) $vgpr0 = COPY %1 @@ -6480,166 +7035,256 @@ body: | ; CI-LABEL: name: test_load_flat_v2s64_align2 ; CI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], 
[[C2]](s64) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; CI: [[GEP5:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C1]](s64) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p0) :: (load 2) - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C2]](s64) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p0) :: (load 2) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_flat_v2s64_align2 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p0) = 
G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[GEP5:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C1]](s64) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p0) :: (load 2) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C2]](s64) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p0) :: (load 2) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: test_load_flat_v2s64_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; 
GFX9: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[GEP5:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C1]](s64) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p0) :: (load 2) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C2]](s64) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p0) :: (load 2) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = 
G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-MESA-LABEL: name: test_load_flat_v2s64_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-MESA: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; CI-MESA: [[GEP5:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C1]](s64) ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p0) :: (load 2) - ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[GEP6:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C2]](s64) ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p0) :: (load 2) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; 
CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-MESA-LABEL: name: test_load_flat_v2s64_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA: [[GEP5:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C1]](s64) ; GFX9-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD6]](s32) ; GFX9-MESA: [[GEP6:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C2]](s64) ; GFX9-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p0) :: (load 2) - ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p0) = COPY $vgpr0_vgpr1 @@ -6711,9 +7356,18 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; CI: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -6734,34 +7388,42 @@ body: | ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: 
[[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_flat_v2s64_align1 @@ -6814,9 +7476,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) 
= G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; VI: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -6836,27 +7507,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: test_load_flat_v2s64_align1 @@ -6909,9 +7588,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: 
[[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; GFX9: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -6931,27 +7619,35 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; 
CI-MESA-LABEL: name: test_load_flat_v2s64_align1 @@ -7012,9 +7708,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -7035,34 +7740,42 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: 
[[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-MESA-LABEL: name: test_load_flat_v2s64_align1 @@ -7115,9 +7828,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -7137,27 +7859,35 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; 
GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p0) = COPY $vgpr0_vgpr1 @@ -7385,9 +8115,18 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], 
[[C11]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; CI: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -7408,36 +8147,44 @@ body: | ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; CI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI: 
[[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C12]](s64) ; CI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; CI: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; CI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -7458,34 +8205,42 @@ body: | ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), 
[[MV2]](s64) ; CI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; CI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7540,9 +8295,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; VI: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -7562,29 +8326,37 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; VI: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: 
[[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; VI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; VI: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; VI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; VI: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; VI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -7604,27 +8376,35 @@ body: | ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; VI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; VI: [[DEF:%[0-9]+]]:_(<4 x s64>) = 
G_IMPLICIT_DEF ; VI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7679,9 +8459,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; GFX9: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -7701,29 +8490,37 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: 
[[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX9: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; GFX9: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; GFX9: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; GFX9: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -7743,27 +8540,35 @@ body: | ; GFX9: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; GFX9: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES 
[[OR16]](s32), [[OR17]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7826,9 +8631,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -7849,36 +8663,44 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL 
[[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI-MESA: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; CI-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI-MESA: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C12]](s64) ; CI-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; CI-MESA: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; CI-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -7899,34 +8721,42 @@ body: | ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-MESA: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI-MESA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-MESA: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI-MESA: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: 
[[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-MESA: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI-MESA: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-MESA: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7981,9 +8811,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p0) = 
G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -8003,29 +8842,37 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX9-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; GFX9-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; GFX9-MESA: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; GFX9-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -8045,27 +8892,35 @@ body: | ; GFX9-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; 
GFX9-MESA: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9-MESA: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9-MESA: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9-MESA: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -8243,9 +9098,18 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; CI: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -8266,37 +9130,45 @@ body: | ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] 
+ ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) + ; CI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C12]](s64) ; CI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; CI: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; CI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -8317,35 +9189,43 @@ body: | ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; CI: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C10]](s64) + ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI: 
[[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; CI: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C11]](s64) ; CI: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p0) :: (load 1) ; CI: [[GEP24:%[0-9]+]]:_(p0) = G_GEP [[GEP23]], [[C]](s64) ; CI: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p0) :: (load 1) @@ -8366,34 +9246,42 @@ body: | ; CI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LOAD25]](s32) ; CI: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C9]] - ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) - ; CI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) - ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] + ; CI: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) + ; CI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL18]](s32) + ; CI: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] ; CI: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; CI: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; CI: [[COPY26:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LOAD27]](s32) ; CI: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C9]] - ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) - ; CI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) - ; CI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] + ; CI: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) + ; CI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL19]](s32) + ; CI: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] ; CI: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; CI: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; CI: [[COPY28:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LOAD29]](s32) ; CI: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C9]] - ; CI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) - ; CI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) - ; CI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] + ; CI: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) + ; CI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL20]](s32) + ; CI: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] ; CI: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; CI: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; CI: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LOAD31]](s32) ; CI: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C9]] - ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) - ; CI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) - ; CI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] - ; CI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; CI: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) + ; CI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL21]](s32) + ; CI: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] + ; CI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; CI: 
[[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; CI: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C10]](s32) + ; CI: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; CI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; CI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; CI: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C10]](s32) + ; CI: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; CI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; CI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64) ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) @@ -8447,9 +9335,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; VI: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -8469,30 +9366,38 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; 
VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; VI: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; VI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; VI: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; VI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; VI: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; VI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -8512,28 +9417,36 @@ body: | ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; VI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; VI: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C9]](s64) + ; VI: [[SHL15:%[0-9]+]]:_(s16) 
= G_SHL [[AND23]], [[C8]](s16) + ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; VI: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C10]](s64) ; VI: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p0) :: (load 1) ; VI: [[GEP24:%[0-9]+]]:_(p0) = G_GEP [[GEP23]], [[C]](s64) ; VI: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p0) :: (load 1) @@ -8553,27 +9466,35 @@ body: | ; VI: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC24]], [[C7]] ; VI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD25]](s32) ; VI: [[AND25:%[0-9]+]]:_(s16) = G_AND [[TRUNC25]], [[C7]] - ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) - ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL12]] + ; VI: [[SHL18:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) + ; VI: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL18]] ; VI: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; VI: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; VI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD27]](s32) ; VI: [[AND27:%[0-9]+]]:_(s16) = G_AND [[TRUNC27]], [[C7]] - ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) - ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL13]] + ; VI: [[SHL19:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) + ; VI: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL19]] ; VI: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; VI: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; VI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD29]](s32) ; VI: [[AND29:%[0-9]+]]:_(s16) = G_AND [[TRUNC29]], [[C7]] - ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) - ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL14]] + ; VI: [[SHL20:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) + ; VI: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL20]] ; VI: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; VI: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; VI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD31]](s32) ; VI: [[AND31:%[0-9]+]]:_(s16) = G_AND [[TRUNC31]], [[C7]] - ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) - ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL15]] - ; VI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; VI: [[SHL21:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) + ; VI: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL21]] + ; VI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; VI: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; VI: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C9]](s32) + ; VI: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; VI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; VI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; VI: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C9]](s32) + ; VI: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; VI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), 
[[MV3]](s64) ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) @@ -8627,9 +9548,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; GFX9: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -8649,30 +9579,38 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; 
GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; GFX9: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; GFX9: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; GFX9: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -8692,28 +9630,36 @@ body: | ; GFX9: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; GFX9: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C9]](s64) + ; GFX9: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9: 
[[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; GFX9: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C10]](s64) ; GFX9: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p0) :: (load 1) ; GFX9: [[GEP24:%[0-9]+]]:_(p0) = G_GEP [[GEP23]], [[C]](s64) ; GFX9: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p0) :: (load 1) @@ -8733,27 +9679,35 @@ body: | ; GFX9: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC24]], [[C7]] ; GFX9: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD25]](s32) ; GFX9: [[AND25:%[0-9]+]]:_(s16) = G_AND [[TRUNC25]], [[C7]] - ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) - ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL12]] + ; GFX9: [[SHL18:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) + ; GFX9: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL18]] ; GFX9: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; GFX9: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; GFX9: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD27]](s32) ; GFX9: [[AND27:%[0-9]+]]:_(s16) = G_AND [[TRUNC27]], [[C7]] - ; GFX9: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) - ; GFX9: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL13]] + ; GFX9: [[SHL19:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) + ; GFX9: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL19]] ; GFX9: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; GFX9: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; GFX9: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD29]](s32) ; GFX9: [[AND29:%[0-9]+]]:_(s16) = G_AND [[TRUNC29]], [[C7]] - ; GFX9: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) - ; GFX9: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL14]] + ; GFX9: [[SHL20:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) + ; GFX9: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL20]] ; GFX9: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; GFX9: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; GFX9: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD31]](s32) ; GFX9: [[AND31:%[0-9]+]]:_(s16) = G_AND [[TRUNC31]], [[C7]] - ; GFX9: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) - ; GFX9: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL15]] - ; GFX9: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; GFX9: [[SHL21:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) + ; GFX9: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL21]] + ; GFX9: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; GFX9: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; GFX9: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C9]](s32) + ; GFX9: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; GFX9: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; GFX9: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; GFX9: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C9]](s32) + ; GFX9: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; GFX9: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) @@ 
-8815,9 +9769,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -8838,37 +9801,45 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] 
; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI-MESA: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) + ; CI-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI-MESA: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C12]](s64) ; CI-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; CI-MESA: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C]](s64) ; CI-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -8889,35 +9860,43 @@ body: | ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-MESA: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI-MESA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-MESA: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI-MESA: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-MESA: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; 
CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI-MESA: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-MESA: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; CI-MESA: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C10]](s64) + ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; CI-MESA: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C11]](s64) ; CI-MESA: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p0) :: (load 1) ; CI-MESA: [[GEP24:%[0-9]+]]:_(p0) = G_GEP [[GEP23]], [[C]](s64) ; CI-MESA: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p0) :: (load 1) @@ -8938,34 +9917,42 @@ body: | ; CI-MESA: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LOAD25]](s32) ; CI-MESA: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C9]] - ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) - ; CI-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) - ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] + ; CI-MESA: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) + ; CI-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL18]](s32) + ; CI-MESA: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] ; CI-MESA: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; CI-MESA: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; CI-MESA: [[COPY26:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LOAD27]](s32) ; CI-MESA: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C9]] - ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) - ; CI-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) - ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] + ; CI-MESA: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) + ; CI-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL19]](s32) + ; CI-MESA: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] ; CI-MESA: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; CI-MESA: [[AND28:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC28]], [[C7]] ; CI-MESA: [[COPY28:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LOAD29]](s32) ; CI-MESA: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C9]] - ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) - ; CI-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) - ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] + ; CI-MESA: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) + ; CI-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL20]](s32) + ; CI-MESA: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] ; CI-MESA: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; CI-MESA: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; CI-MESA: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LOAD31]](s32) ; CI-MESA: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C9]] - ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) - ; CI-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) - ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] - ; CI-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; CI-MESA: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) + ; CI-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL21]](s32) + ; CI-MESA: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] + ; CI-MESA: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; CI-MESA: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; CI-MESA: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C10]](s32) + ; CI-MESA: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; CI-MESA: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; CI-MESA: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; CI-MESA: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C10]](s32) + ; CI-MESA: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; CI-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; CI-MESA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64) ; CI-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) @@ -9019,9 +10006,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: 
[[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -9041,30 +10037,38 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; GFX9-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p0) :: (load 1) ; GFX9-MESA: [[GEP16:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], 
[[C]](s64) ; GFX9-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p0) :: (load 1) @@ -9084,28 +10088,36 @@ body: | ; GFX9-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9-MESA: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9-MESA: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9-MESA: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9-MESA: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; GFX9-MESA: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C9]](s64) + ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; GFX9-MESA: [[GEP23:%[0-9]+]]:_(p0) = G_GEP [[GEP15]], [[C10]](s64) ; GFX9-MESA: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p0) :: (load 1) ; GFX9-MESA: [[GEP24:%[0-9]+]]:_(p0) = G_GEP [[GEP23]], [[C]](s64) ; GFX9-MESA: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p0) :: (load 1) @@ -9125,27 +10137,35 @@ body: | ; GFX9-MESA: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC24]], [[C7]] ; GFX9-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD25]](s32) ; GFX9-MESA: [[AND25:%[0-9]+]]:_(s16) = G_AND [[TRUNC25]], [[C7]] - ; GFX9-MESA: 
[[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) - ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL12]] + ; GFX9-MESA: [[SHL18:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) + ; GFX9-MESA: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL18]] ; GFX9-MESA: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; GFX9-MESA: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; GFX9-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD27]](s32) ; GFX9-MESA: [[AND27:%[0-9]+]]:_(s16) = G_AND [[TRUNC27]], [[C7]] - ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) - ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL13]] + ; GFX9-MESA: [[SHL19:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) + ; GFX9-MESA: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL19]] ; GFX9-MESA: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; GFX9-MESA: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; GFX9-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD29]](s32) ; GFX9-MESA: [[AND29:%[0-9]+]]:_(s16) = G_AND [[TRUNC29]], [[C7]] - ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) - ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL14]] + ; GFX9-MESA: [[SHL20:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) + ; GFX9-MESA: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL20]] ; GFX9-MESA: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; GFX9-MESA: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; GFX9-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD31]](s32) ; GFX9-MESA: [[AND31:%[0-9]+]]:_(s16) = G_AND [[TRUNC31]], [[C7]] - ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) - ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL15]] - ; GFX9-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; GFX9-MESA: [[SHL21:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) + ; GFX9-MESA: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL21]] + ; GFX9-MESA: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; GFX9-MESA: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; GFX9-MESA: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C9]](s32) + ; GFX9-MESA: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; GFX9-MESA: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; GFX9-MESA: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; GFX9-MESA: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C9]](s32) + ; GFX9-MESA: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; GFX9-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; GFX9-MESA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64) ; GFX9-MESA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) @@ -9362,9 +10382,18 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; CI: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -9385,34 +10414,42 @@ body: | ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + 
; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; VI-LABEL: name: test_load_flat_v2p1_align1 @@ -9465,9 +10502,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; VI: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -9487,27 +10533,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] 
- ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; GFX9-LABEL: name: test_load_flat_v2p1_align1 @@ -9560,9 +10614,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; GFX9: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -9582,27 +10645,35 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], 
[[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; CI-MESA-LABEL: name: test_load_flat_v2p1_align1 @@ -9663,9 +10734,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -9686,34 +10766,42 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR 
[[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; GFX9-MESA-LABEL: name: test_load_flat_v2p1_align1 @@ -9766,9 +10854,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: 
[[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p0) :: (load 1) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p0) :: (load 1) @@ -9788,27 +10885,35 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT 
[[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) %0:_(p0) = COPY $vgpr0_vgpr1 @@ -9914,9 +11019,14 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C6]](s64) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; CI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C7]](s64) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 1) ; CI: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 1) @@ -9929,19 +11039,23 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; VI-LABEL: name: test_load_flat_v2p3_align1 ; VI: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -9969,9 +11083,14 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: 
[[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C6]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 1) ; VI: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 1) @@ -9983,16 +11102,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; GFX9-LABEL: name: test_load_flat_v2p3_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -10020,9 +11143,14 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX9: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C6]](s64) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 1) ; GFX9: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 1) @@ -10034,16 +11162,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; CI-MESA-LABEL: name: test_load_flat_v2p3_align1 ; CI-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -10075,9 +11207,14 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C6]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C7]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 1) ; CI-MESA: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 1) @@ -10090,19 +11227,23 @@ body: | ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-MESA: 
[[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; CI-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; GFX9-MESA-LABEL: name: test_load_flat_v2p3_align1 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 @@ -10130,9 +11271,14 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C6]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p0) :: (load 1) ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[GEP3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p0) :: (load 1) @@ -10144,16 +11290,20 @@ body: | ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9-MESA: 
[[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load 8, align 1, addrspace 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir index e35914caa725ec..9e7d3338d7867c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir @@ -446,13 +446,18 @@ body: | ; SI-LABEL: name: test_load_global_s32_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: $vgpr0 = COPY [[MV]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: $vgpr0 = COPY [[OR]](s32) ; CI-HSA-LABEL: name: test_load_global_s32_align2 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-HSA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 4, align 2, addrspace 1) @@ -460,23 +465,33 @@ body: | ; CI-MESA-LABEL: name: test_load_global_s32_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](s32) + ; CI-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: $vgpr0 = COPY [[OR]](s32) ; VI-LABEL: name: test_load_global_s32_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: $vgpr0 = COPY [[OR]](s32) ; GFX9-HSA-LABEL: name: test_load_global_s32_align2 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 4, align 2, addrspace 1) @@ -484,13 +499,18 @@ body: | ; GFX9-MESA-LABEL: name: test_load_global_s32_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](s32) + ; GFX9-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: $vgpr0 = COPY [[OR]](s32) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, align 2, addrspace 1) $vgpr0 = COPY %1 @@ -532,8 +552,12 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: $vgpr0 = COPY [[MV]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: $vgpr0 = COPY [[OR2]](s32) ; CI-HSA-LABEL: name: test_load_global_s32_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-HSA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 4, align 1, addrspace 1) @@ -568,8 +592,12 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](s32) + ; CI-MESA: 
[[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: $vgpr0 = COPY [[OR2]](s32) ; VI-LABEL: name: test_load_global_s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 1, addrspace 1) @@ -596,8 +624,12 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: $vgpr0 = COPY [[OR2]](s32) ; GFX9-HSA-LABEL: name: test_load_global_s32_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 4, align 1, addrspace 1) @@ -628,8 +660,12 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](s32) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: $vgpr0 = COPY [[OR2]](s32) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, align 1, addrspace 1) $vgpr0 = COPY %1 @@ -775,20 +811,30 @@ body: | ; SI-LABEL: name: test_load_global_s64_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; SI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-HSA-LABEL: name: test_load_global_s64_align2 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -797,38 +843,58 @@ body: | ; CI-MESA-LABEL: name: test_load_global_s64_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_global_s64_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: 
[[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-HSA-LABEL: name: test_load_global_s64_align2 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -837,20 +903,30 @@ body: | ; GFX9-MESA-LABEL: name: test_load_global_s64_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9-MESA: 
$vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = G_LOAD %0 :: (load 8, align 2, addrspace 1) @@ -921,7 +997,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-HSA-LABEL: name: test_load_global_s64_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -985,7 +1070,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_global_s64_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -1037,7 +1131,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-HSA-LABEL: name: test_load_global_s64_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -1093,7 +1196,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES 
[[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = G_LOAD %0 :: (load 8, align 1, addrspace 1) @@ -1214,28 +1326,42 @@ body: | ; SI-LABEL: name: test_load_global_s96_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; SI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; SI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; SI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C4]](s64) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; SI: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-HSA-LABEL: name: test_load_global_s96_align2 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -1244,54 +1370,82 @@ body: | ; CI-MESA-LABEL: name: test_load_global_s96_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CI-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CI-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C4]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; CI-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_global_s96_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: 
(load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; VI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; VI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C4]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-HSA-LABEL: name: test_load_global_s96_align2 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -1300,28 +1454,42 @@ body: | ; GFX9-MESA-LABEL: name: test_load_global_s96_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 
2, addrspace 1) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C4]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s96) = G_LOAD %0 :: (load 12, align 2, addrspace 1) @@ -1420,7 +1588,20 @@ body: | ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXT5]], [[C14]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-HSA-LABEL: name: test_load_global_s96_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -1512,7 +1693,20 @@ body: | ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_global_s96_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -1588,7 +1782,20 @@ body: | ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; VI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-HSA-LABEL: name: test_load_global_s96_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -1668,7 +1875,20 @@ body: | ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: 
[[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 1) @@ -1971,7 +2191,24 @@ body: | ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C16]](s32) ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C18]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C18]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C18]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C18]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; CI-HSA-LABEL: name: test_load_global_s128_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2091,7 +2328,24 @@ body: | ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C16]](s32) ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C18]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C18]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; 
CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C18]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C18]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_global_s128_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2191,7 +2445,24 @@ body: | ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]] ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16) ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; VI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; GFX9-HSA-LABEL: name: test_load_global_s128_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2295,7 +2566,24 @@ body: | ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]] ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16) ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] 
+ ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 1) @@ -2471,7 +2759,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-HSA-LABEL: name: test_load_global_p1_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2535,7 +2832,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p1) ; VI-LABEL: name: test_load_global_p1_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2587,7 +2893,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY 
[[MV]](p1) ; GFX9-HSA-LABEL: name: test_load_global_p1_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2643,7 +2958,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(p1) = G_LOAD %0 :: (load 8, align 1, addrspace 1) @@ -2768,20 +3092,30 @@ body: | ; SI-LABEL: name: test_load_global_p4_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; SI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; CI-HSA-LABEL: name: test_load_global_p4_align2 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2790,38 +3124,58 @@ body: | ; CI-MESA-LABEL: name: test_load_global_p4_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; CI-MESA: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) ; VI-LABEL: name: test_load_global_p4_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: 
[[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-HSA-LABEL: name: test_load_global_p4_align2 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2830,20 +3184,30 @@ body: | ; GFX9-MESA-LABEL: name: test_load_global_p4_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(p4) = G_LOAD %0 :: (load 8, align 2, addrspace 1) @@ -2914,7 +3278,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], 
[[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; CI-HSA-LABEL: name: test_load_global_p4_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -2978,7 +3351,16 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) ; VI-LABEL: name: test_load_global_p4_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -3030,7 +3412,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p4) ; GFX9-HSA-LABEL: name: test_load_global_p4_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -3086,7 +3477,16 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[MV]](p4) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(p4) = G_LOAD %0 :: (load 8, align 1, addrspace 1) @@ -3137,13 +3537,19 @@ body: | ; SI-LABEL: name: test_load_global_p5_align2 ; SI: 
[[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p5) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-HSA-LABEL: name: test_load_global_p5_align2 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-HSA: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p1) :: (load 4, align 2, addrspace 1) @@ -3151,23 +3557,35 @@ body: | ; CI-MESA-LABEL: name: test_load_global_p5_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](p5) + ; CI-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; CI-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_global_p5_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: 
[[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-HSA-LABEL: name: test_load_global_p5_align2 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p1) :: (load 4, align 2, addrspace 1) @@ -3175,13 +3593,19 @@ body: | ; GFX9-MESA-LABEL: name: test_load_global_p5_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](p5) + ; GFX9-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX9-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(p5) = G_LOAD %0 :: (load 4, align 2, addrspace 1) $vgpr0 = COPY %1 @@ -3223,8 +3647,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p5) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-HSA-LABEL: name: test_load_global_p5_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-HSA: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p1) :: (load 4, align 1, addrspace 1) @@ -3259,8 +3688,13 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: $vgpr0 = COPY [[MV]](p5) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; CI-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_global_p5_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 1, addrspace 1) @@ -3287,8 +3721,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-HSA-LABEL: name: test_load_global_p5_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p1) :: (load 4, align 1, addrspace 1) @@ -3319,8 +3758,13 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: $vgpr0 = COPY [[MV]](p5) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX9-MESA: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(p5) = G_LOAD %0 :: (load 4, align 1, addrspace 1) $vgpr0 = COPY %1 @@ -6434,34 +6878,52 @@ body: | ; SI-LABEL: name: test_load_global_v2s64_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; SI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; SI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND 
[[COPY4]], [[C3]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; SI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; SI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; SI: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C1]](s64) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[GEP6:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C2]](s64) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-HSA-LABEL: name: test_load_global_v2s64_align2 @@ -6471,67 +6933,103 @@ body: | ; CI-MESA-LABEL: name: test_load_global_v2s64_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) + ; CI-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + 
; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; CI-MESA: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C1]](s64) ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[GEP6:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C2]](s64) ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_global_v2s64_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: 
[[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C1]](s64) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C2]](s64) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-HSA-LABEL: name: test_load_global_v2s64_align2 @@ -6541,34 +7039,52 @@ body: | ; GFX9-MESA-LABEL: name: test_load_global_v2s64_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9-MESA: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C1]](s64) ; GFX9-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[GEP6:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C2]](s64) ; GFX9-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND 
[[COPY7]], [[C3]] + ; GFX9-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p1) = COPY $vgpr0_vgpr1 @@ -6640,9 +7156,18 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; SI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; SI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; SI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; SI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -6663,34 +7188,42 @@ body: | ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], 
[[TRUNC13]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-HSA-LABEL: name: test_load_global_v2s64_align1 @@ -6755,9 +7288,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -6778,34 +7320,42 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; 
CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_global_v2s64_align1 @@ -6858,9 +7408,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: 
[[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; VI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -6880,27 +7439,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-HSA-LABEL: name: test_load_global_v2s64_align1 @@ -6957,9 +7524,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -6979,27 +7555,35 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], 
[[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p1) = COPY $vgpr0_vgpr1 @@ -7169,9 +7753,18 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; SI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; SI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; SI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; SI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -7192,36 +7785,44 @@ body: | ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; SI: 
[[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; SI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; SI: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; SI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; SI: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C12]](s64) ; SI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p1) :: (load 1, addrspace 1) ; SI: [[GEP16:%[0-9]+]]:_(p1) = G_GEP [[GEP15]], [[C]](s64) ; SI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p1) :: (load 1, addrspace 1) @@ -7242,34 +7843,42 @@ body: | ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; SI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; SI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; SI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; SI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; SI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], 
[[TRUNC19]] + ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; SI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; SI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; SI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; SI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; SI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; SI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; SI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; SI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; SI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; SI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; SI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; SI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; SI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; SI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; SI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; SI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7338,9 +7947,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -7361,36 +7979,44 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI-MESA: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: 
[[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; CI-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI-MESA: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C12]](s64) ; CI-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP16:%[0-9]+]]:_(p1) = G_GEP [[GEP15]], [[C]](s64) ; CI-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p1) :: (load 1, addrspace 1) @@ -7411,34 +8037,42 @@ body: | ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-MESA: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI-MESA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-MESA: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI-MESA: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-MESA: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI-MESA: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-MESA: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], 
[[TRUNC23]] + ; CI-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; CI-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; CI-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7493,9 +8127,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; VI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -7515,29 +8158,37 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: 
[[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; VI: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; VI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; VI: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; VI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p1) :: (load 1, addrspace 1) ; VI: [[GEP16:%[0-9]+]]:_(p1) = G_GEP [[GEP15]], [[C]](s64) ; VI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p1) :: (load 1, addrspace 1) @@ -7557,27 +8208,35 @@ body: | ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; VI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL 
[[AND23]], [[C8]](s16) + ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; VI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; VI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7638,9 +8297,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -7660,29 +8328,37 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX9-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; GFX9-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP16:%[0-9]+]]:_(p1) = G_GEP [[GEP15]], [[C]](s64) ; GFX9-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p1) :: (load 1, addrspace 1) @@ -7702,27 +8378,35 @@ body: | ; GFX9-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9-MESA: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9-MESA: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9-MESA: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; 
GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9-MESA: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) ; GFX9-MESA: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF ; GFX9-MESA: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0 @@ -7868,9 +8552,18 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; SI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; SI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; SI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; SI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -7891,36 +8584,44 @@ body: | ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: 
[[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; SI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; SI: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; SI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; SI: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C12]](s64) ; SI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p1) :: (load 1, addrspace 1) ; SI: [[GEP16:%[0-9]+]]:_(p1) = G_GEP [[GEP15]], [[C]](s64) ; SI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p1) :: (load 1, addrspace 1) @@ -7941,36 +8642,44 @@ body: | ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; SI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = 
G_TRUNC [[SHL12]](s32) + ; SI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; SI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; SI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; SI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; SI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; SI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; SI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; SI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; SI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; SI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; SI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; SI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; SI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; SI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; SI: [[GEP23:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C12]](s64) + ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; SI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; SI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; SI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; SI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; SI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; SI: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; SI: [[GEP23:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C13]](s64) ; SI: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p1) :: (load 1, addrspace 1) ; SI: [[GEP24:%[0-9]+]]:_(p1) = G_GEP [[GEP23]], [[C]](s64) ; SI: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p1) :: (load 1, addrspace 1) @@ -7991,34 +8700,42 @@ body: | ; SI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LOAD25]](s32) ; SI: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C9]] - ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) - ; SI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) - ; SI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], 
[[TRUNC25]] + ; SI: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) + ; SI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL18]](s32) + ; SI: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] ; SI: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; SI: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; SI: [[COPY26:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LOAD27]](s32) ; SI: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C9]] - ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) - ; SI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) - ; SI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] + ; SI: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) + ; SI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL19]](s32) + ; SI: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] ; SI: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; SI: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; SI: [[COPY28:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LOAD29]](s32) ; SI: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C9]] - ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) - ; SI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) - ; SI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] + ; SI: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) + ; SI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL20]](s32) + ; SI: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] ; SI: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; SI: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; SI: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LOAD31]](s32) ; SI: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C9]] - ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) - ; SI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) - ; SI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] - ; SI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; SI: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) + ; SI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL21]](s32) + ; SI: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] + ; SI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; SI: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; SI: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C10]](s32) + ; SI: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; SI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; SI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; SI: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C10]](s32) + ; SI: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; SI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; CI-HSA-LABEL: name: test_load_global_v4s64_align1 @@ -8083,9 +8800,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], 
[[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -8106,36 +8832,44 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: 
[[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CI-MESA: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; CI-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CI-MESA: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C12]](s64) ; CI-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP16:%[0-9]+]]:_(p1) = G_GEP [[GEP15]], [[C]](s64) ; CI-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p1) :: (load 1, addrspace 1) @@ -8156,36 +8890,44 @@ body: | ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-MESA: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C9]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI-MESA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-MESA: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C9]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; CI-MESA: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-MESA: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C9]] - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; 
CI-MESA: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-MESA: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C9]] - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; CI-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; CI-MESA: [[GEP23:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C12]](s64) + ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; CI-MESA: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; CI-MESA: [[GEP23:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C13]](s64) ; CI-MESA: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP24:%[0-9]+]]:_(p1) = G_GEP [[GEP23]], [[C]](s64) ; CI-MESA: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p1) :: (load 1, addrspace 1) @@ -8206,34 +8948,42 @@ body: | ; CI-MESA: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LOAD25]](s32) ; CI-MESA: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C9]] - ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) - ; CI-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) - ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] + ; CI-MESA: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[COPY24]](s32) + ; CI-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[SHL18]](s32) + ; CI-MESA: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[TRUNC25]] ; CI-MESA: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; CI-MESA: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; CI-MESA: [[COPY26:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LOAD27]](s32) ; CI-MESA: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C9]] - ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) - ; CI-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) - ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] + ; CI-MESA: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[COPY26]](s32) + ; CI-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[SHL19]](s32) + ; CI-MESA: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[TRUNC27]] ; CI-MESA: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; CI-MESA: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; CI-MESA: [[COPY28:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LOAD29]](s32) ; CI-MESA: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C9]] - ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) - ; CI-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC 
[[SHL14]](s32) - ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] + ; CI-MESA: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[COPY28]](s32) + ; CI-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[SHL20]](s32) + ; CI-MESA: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[TRUNC29]] ; CI-MESA: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; CI-MESA: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; CI-MESA: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LOAD31]](s32) ; CI-MESA: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C9]] - ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) - ; CI-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) - ; CI-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] - ; CI-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; CI-MESA: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[COPY30]](s32) + ; CI-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[SHL21]](s32) + ; CI-MESA: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[TRUNC31]] + ; CI-MESA: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; CI-MESA: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; CI-MESA: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C10]](s32) + ; CI-MESA: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; CI-MESA: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; CI-MESA: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; CI-MESA: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C10]](s32) + ; CI-MESA: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; CI-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; VI-LABEL: name: test_load_global_v4s64_align1 @@ -8286,9 +9036,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; VI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -8308,29 +9067,37 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL 
[[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; VI: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; VI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; VI: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; VI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p1) :: (load 1, addrspace 1) ; VI: [[GEP16:%[0-9]+]]:_(p1) = G_GEP [[GEP15]], [[C]](s64) ; VI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p1) :: (load 1, addrspace 1) @@ -8350,29 +9117,37 @@ body: | ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; VI: 
[[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; VI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; VI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; VI: [[GEP23:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) + ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; VI: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; VI: [[GEP23:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C12]](s64) ; VI: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p1) :: (load 1, addrspace 1) ; VI: [[GEP24:%[0-9]+]]:_(p1) = G_GEP [[GEP23]], [[C]](s64) ; VI: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p1) :: (load 1, addrspace 1) @@ -8392,27 +9167,35 @@ body: | ; VI: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC24]], [[C7]] ; VI: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD25]](s32) ; VI: [[AND25:%[0-9]+]]:_(s16) = G_AND [[TRUNC25]], [[C7]] - ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) - ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL12]] + ; VI: [[SHL18:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) + ; VI: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL18]] ; VI: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; VI: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; VI: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD27]](s32) ; VI: [[AND27:%[0-9]+]]:_(s16) = G_AND [[TRUNC27]], [[C7]] - ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) - ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL13]] + ; VI: [[SHL19:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) + ; VI: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL19]] ; VI: [[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; VI: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; VI: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD29]](s32) ; VI: [[AND29:%[0-9]+]]:_(s16) = G_AND [[TRUNC29]], [[C7]] - ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) - ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL14]] + ; VI: [[SHL20:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) 
+ ; VI: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL20]] ; VI: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; VI: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; VI: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD31]](s32) ; VI: [[AND31:%[0-9]+]]:_(s16) = G_AND [[TRUNC31]], [[C7]] - ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) - ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL15]] - ; VI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; VI: [[SHL21:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) + ; VI: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL21]] + ; VI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; VI: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; VI: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C9]](s32) + ; VI: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; VI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; VI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; VI: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C9]](s32) + ; VI: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; VI: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; GFX9-HSA-LABEL: name: test_load_global_v4s64_align1 @@ -8469,9 +9252,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -8491,29 +9283,37 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], 
[[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) - ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) + ; GFX9-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; GFX9-MESA: [[GEP15:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; GFX9-MESA: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP16:%[0-9]+]]:_(p1) = G_GEP [[GEP15]], [[C]](s64) ; GFX9-MESA: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p1) :: (load 1, addrspace 1) @@ -8533,29 +9333,37 @@ body: | ; GFX9-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9-MESA: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9-MESA: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9-MESA: 
[[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] ; GFX9-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9-MESA: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL14]] ; GFX9-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9-MESA: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) - ; GFX9-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 - ; GFX9-MESA: [[GEP23:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) + ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL15]] + ; GFX9-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9-MESA: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR16]](s32), [[OR17]](s32) + ; GFX9-MESA: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 24 + ; GFX9-MESA: [[GEP23:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C12]](s64) ; GFX9-MESA: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[GEP23]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP24:%[0-9]+]]:_(p1) = G_GEP [[GEP23]], [[C]](s64) ; GFX9-MESA: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[GEP24]](p1) :: (load 1, addrspace 1) @@ -8575,27 +9383,35 @@ body: | ; GFX9-MESA: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC24]], [[C7]] ; GFX9-MESA: [[TRUNC25:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD25]](s32) ; GFX9-MESA: [[AND25:%[0-9]+]]:_(s16) = G_AND [[TRUNC25]], [[C7]] - ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) - ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL12]] + ; GFX9-MESA: [[SHL18:%[0-9]+]]:_(s16) = G_SHL [[AND25]], [[C8]](s16) + ; GFX9-MESA: [[OR18:%[0-9]+]]:_(s16) = G_OR [[AND24]], [[SHL18]] ; GFX9-MESA: [[TRUNC26:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD26]](s32) ; GFX9-MESA: [[AND26:%[0-9]+]]:_(s16) = G_AND [[TRUNC26]], [[C7]] ; GFX9-MESA: [[TRUNC27:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD27]](s32) ; GFX9-MESA: [[AND27:%[0-9]+]]:_(s16) = G_AND [[TRUNC27]], [[C7]] - ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) - ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL13]] + ; GFX9-MESA: [[SHL19:%[0-9]+]]:_(s16) = G_SHL [[AND27]], [[C8]](s16) + ; GFX9-MESA: [[OR19:%[0-9]+]]:_(s16) = G_OR [[AND26]], [[SHL19]] ; GFX9-MESA: 
[[TRUNC28:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD28]](s32) ; GFX9-MESA: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC28]], [[C7]] ; GFX9-MESA: [[TRUNC29:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD29]](s32) ; GFX9-MESA: [[AND29:%[0-9]+]]:_(s16) = G_AND [[TRUNC29]], [[C7]] - ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) - ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL14]] + ; GFX9-MESA: [[SHL20:%[0-9]+]]:_(s16) = G_SHL [[AND29]], [[C8]](s16) + ; GFX9-MESA: [[OR20:%[0-9]+]]:_(s16) = G_OR [[AND28]], [[SHL20]] ; GFX9-MESA: [[TRUNC30:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD30]](s32) ; GFX9-MESA: [[AND30:%[0-9]+]]:_(s16) = G_AND [[TRUNC30]], [[C7]] ; GFX9-MESA: [[TRUNC31:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD31]](s32) ; GFX9-MESA: [[AND31:%[0-9]+]]:_(s16) = G_AND [[TRUNC31]], [[C7]] - ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) - ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL15]] - ; GFX9-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR12]](s16), [[OR13]](s16), [[OR14]](s16), [[OR15]](s16) + ; GFX9-MESA: [[SHL21:%[0-9]+]]:_(s16) = G_SHL [[AND31]], [[C8]](s16) + ; GFX9-MESA: [[OR21:%[0-9]+]]:_(s16) = G_OR [[AND30]], [[SHL21]] + ; GFX9-MESA: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[OR18]](s16) + ; GFX9-MESA: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[OR19]](s16) + ; GFX9-MESA: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[ZEXT13]], [[C9]](s32) + ; GFX9-MESA: [[OR22:%[0-9]+]]:_(s32) = G_OR [[ZEXT12]], [[SHL22]] + ; GFX9-MESA: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[OR20]](s16) + ; GFX9-MESA: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[OR21]](s16) + ; GFX9-MESA: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[ZEXT15]], [[C9]](s32) + ; GFX9-MESA: [[OR23:%[0-9]+]]:_(s32) = G_OR [[ZEXT14]], [[SHL23]] + ; GFX9-MESA: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR22]](s32), [[OR23]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) %0:_(p1) = COPY $vgpr0_vgpr1 @@ -8807,9 +9623,18 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; SI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; SI: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; SI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; SI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -8830,34 +9655,42 @@ body: | ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND 
[[COPY9]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; CI-HSA-LABEL: name: test_load_global_v2p1_align1 @@ -8922,9 +9755,18 @@ body: | ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: 
[[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C11]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -8945,34 +9787,42 @@ body: | ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C9]] - ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY8]](s32) + ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C9]] - ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) + ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C9]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C9]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC 
[[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; CI-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; VI-LABEL: name: test_load_global_v2p1_align1 @@ -9025,9 +9875,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C9]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; VI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -9047,27 +9906,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: 
[[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) ; GFX9-HSA-LABEL: name: test_load_global_v2p1_align1 @@ -9124,9 +9991,18 @@ body: | ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C9]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C10]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP7]], [[C]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 1, addrspace 1) @@ -9146,27 +10022,35 @@ body: | ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL 
[[AND9]], [[C8]](s16) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9-MESA: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>) %0:_(p1) = COPY $vgpr0_vgpr1 @@ -9315,9 +10199,14 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; SI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C6]](s64) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; SI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; SI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C7]](s64) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 1, addrspace 1) ; SI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP 
[[GEP3]], [[C]](s64) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 1, addrspace 1) @@ -9330,19 +10219,23 @@ body: | ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; CI-HSA-LABEL: name: test_load_global_v2p3_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -9378,9 +10271,14 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C6]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C7]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 1, addrspace 1) @@ -9393,19 +10291,23 @@ body: | ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-MESA: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; CI-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; VI-LABEL: name: test_load_global_v2p3_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -9433,9 +10335,14 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C6]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 1, addrspace 1) ; VI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 1, addrspace 1) @@ -9447,16 +10354,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; GFX9-HSA-LABEL: name: test_load_global_v2p3_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -9488,9 +10399,14 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C6]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 1, addrspace 1) @@ -9502,16 +10418,20 @@ body: | ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[MV]](p3), [[MV1]](p3) + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) ; GFX9-MESA: 
$vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load 8, align 1, addrspace 1) @@ -9839,9 +10759,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; SI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C6]](s64) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; SI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C7]](s64) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 1, addrspace 1) ; SI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 1, addrspace 1) @@ -9854,19 +10778,22 @@ body: | ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-HSA-LABEL: name: test_extload_global_v2s32_from_4_align1 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -9906,9 +10833,13 @@ body: | ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C6]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 16 + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C7]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 1, addrspace 1) @@ -9921,19 +10852,22 @@ body: | ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_extload_global_v2s32_from_4_align1 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -9961,9 +10895,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C6]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 1, addrspace 1) ; VI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 1, addrspace 1) @@ -9975,16 +10913,19 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] 
; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-HSA-LABEL: name: test_extload_global_v2s32_from_4_align1 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -10020,9 +10961,13 @@ body: | ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C6]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[GEP3]], [[C]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 1, addrspace 1) @@ -10034,16 +10979,19 @@ body: | ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; 
GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9-MESA: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 1, addrspace 1) @@ -10059,21 +11007,29 @@ body: | ; SI-LABEL: name: test_extload_global_v2s32_from_4_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; SI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; SI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[GEP1]], [[C]](s64) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-HSA-LABEL: name: test_extload_global_v2s32_from_4_align2 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -10086,40 +11042,56 @@ body: | ; CI-MESA-LABEL: name: test_extload_global_v2s32_from_4_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[GEP]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CI-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) + ; CI-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CI-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[GEP1]], [[C]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-MESA: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; CI-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_extload_global_v2s32_from_4_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; VI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; VI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[GEP1]], [[C]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: 
[[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-HSA-LABEL: name: test_extload_global_v2s32_from_4_align2 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 @@ -10132,21 +11104,29 @@ body: | ; GFX9-MESA-LABEL: name: test_extload_global_v2s32_from_4_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) + ; GFX9-MESA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9-MESA: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[GEP1]], [[C]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; GFX9-MESA: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 2, addrspace 1) @@ -10476,9 +11456,22 @@ body: | ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR 
[[AND10]], [[TRUNC11]] - ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; SI: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; SI: [[GEP11:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C14]](s64) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; SI: [[C15:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; SI: [[GEP11:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C15]](s64) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p1) :: (load 1, addrspace 1) ; SI: [[GEP12:%[0-9]+]]:_(p1) = G_GEP [[GEP11]], [[C]](s64) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p1) :: (load 1, addrspace 1) @@ -10507,50 +11500,62 @@ body: | ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C13]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C13]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] ; SI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; SI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; SI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C13]] - ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; SI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; SI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY 
[[LOAD19]](s32) ; SI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C13]] - ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; SI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; SI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; SI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; SI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C13]] - ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; SI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; SI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; SI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; SI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C13]] - ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; SI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; SI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; SI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C14]](s32) + ; SI: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; SI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C14]](s32) + ; SI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; SI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C14]](s32) + ; SI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; SI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; SI: [[COPY24:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; SI: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY24]](s96) @@ -10648,9 +11653,22 @@ body: | ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C12]](s32) ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; CI-MESA: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; CI-MESA: [[GEP11:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C14]](s64) + ; CI-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-MESA: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: 
[[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CI-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; CI-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CI-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; CI-MESA: [[C15:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI-MESA: [[GEP11:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C15]](s64) ; CI-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p1) :: (load 1, addrspace 1) ; CI-MESA: [[GEP12:%[0-9]+]]:_(p1) = G_GEP [[GEP11]], [[C]](s64) ; CI-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p1) :: (load 1, addrspace 1) @@ -10679,50 +11697,62 @@ body: | ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-MESA: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C13]] - ; CI-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) - ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY12]](s32) + ; CI-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; CI-MESA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-MESA: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C13]] - ; CI-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) - ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY14]](s32) + ; CI-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] ; CI-MESA: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; CI-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; CI-MESA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-MESA: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C13]] - ; CI-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) - ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY16]](s32) + ; CI-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) + ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; CI-MESA: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-MESA: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C13]] - ; CI-MESA: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) - ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = 
G_TRUNC [[SHL9]](s32) - ; CI-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-MESA: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY18]](s32) + ; CI-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; CI-MESA: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-MESA: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C13]] - ; CI-MESA: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) - ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-MESA: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY20]](s32) + ; CI-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; CI-MESA: [[COPY22:%[0-9]+]]:_(s32) = COPY [[C12]](s32) ; CI-MESA: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-MESA: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C13]] - ; CI-MESA: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) - ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI-MESA: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY22]](s32) + ; CI-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C14]](s32) + ; CI-MESA: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; CI-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; CI-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C14]](s32) + ; CI-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-MESA: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C14]](s32) + ; CI-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; CI-MESA: [[COPY24:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; CI-MESA: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY24]](s96) @@ -10801,9 +11831,22 @@ body: | ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; VI: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; VI: [[GEP11:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C13]](s64) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL6:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXT1]], [[C13]](s32) + ; VI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; VI: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; VI: [[GEP11:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C14]](s64) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p1) :: (load 1, addrspace 1) ; VI: [[GEP12:%[0-9]+]]:_(p1) = G_GEP [[GEP11]], [[C]](s64) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p1) :: (load 1, addrspace 1) @@ -10831,39 +11874,51 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C11]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C11]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C11]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] ; VI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C11]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) + ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL11]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C11]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL12]] ; VI: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C11]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL13]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; 
VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C11]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) + ; VI: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL14]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C13]](s32) + ; VI: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C13]](s32) + ; VI: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C13]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -10949,9 +12004,22 @@ body: | ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C11]] ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C12]](s16) ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; GFX9-MESA: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9-MESA: [[GEP11:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C13]](s64) + ; GFX9-MESA: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9-MESA: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-MESA: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; GFX9-MESA: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-MESA: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; GFX9-MESA: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9-MESA: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; GFX9-MESA: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9-MESA: [[GEP11:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C14]](s64) ; GFX9-MESA: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p1) :: (load 1, addrspace 1) ; GFX9-MESA: [[GEP12:%[0-9]+]]:_(p1) = G_GEP [[GEP11]], [[C]](s64) ; GFX9-MESA: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p1) :: (load 1, addrspace 1) @@ -10979,39 +12047,51 @@ body: | ; GFX9-MESA: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C11]] ; GFX9-MESA: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9-MESA: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C11]] - ; GFX9-MESA: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) - ; GFX9-MESA: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], 
[[SHL6]] + ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C12]](s16) + ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9-MESA: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9-MESA: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; GFX9-MESA: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9-MESA: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C11]] - ; GFX9-MESA: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) - ; GFX9-MESA: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] + ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C12]](s16) + ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] ; GFX9-MESA: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; GFX9-MESA: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; GFX9-MESA: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9-MESA: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C11]] - ; GFX9-MESA: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) - ; GFX9-MESA: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C12]](s16) + ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL11]] ; GFX9-MESA: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9-MESA: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; GFX9-MESA: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9-MESA: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C11]] - ; GFX9-MESA: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) - ; GFX9-MESA: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] + ; GFX9-MESA: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C12]](s16) + ; GFX9-MESA: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL12]] ; GFX9-MESA: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; GFX9-MESA: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; GFX9-MESA: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9-MESA: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C11]] - ; GFX9-MESA: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) - ; GFX9-MESA: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9-MESA: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C12]](s16) + ; GFX9-MESA: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL13]] ; GFX9-MESA: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9-MESA: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; GFX9-MESA: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9-MESA: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C11]] - ; GFX9-MESA: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) - ; GFX9-MESA: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; GFX9-MESA: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C12]](s16) + ; GFX9-MESA: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL14]] + ; GFX9-MESA: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9-MESA: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9-MESA: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C13]](s32) + ; GFX9-MESA: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; GFX9-MESA: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; GFX9-MESA: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9-MESA: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C13]](s32) + ; GFX9-MESA: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; GFX9-MESA: 
[[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9-MESA: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; GFX9-MESA: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C13]](s32) + ; GFX9-MESA: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -11033,52 +12113,78 @@ body: | ; SI-LABEL: name: test_extload_global_v2s96_from_24_align2 ; SI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; SI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; SI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; SI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; SI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C4]](s64) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; SI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; SI: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; SI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; SI: 
[[GEP5:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C7]](s64) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[GEP6:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C]](s64) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C1]](s64) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; SI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C2]](s64) ; SI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; SI: [[GEP9:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C3]](s64) ; SI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[GEP10:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C4]](s64) ; SI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p1) :: (load 2, addrspace 1) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; SI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; SI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; SI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; SI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; CI-HSA-LABEL: name: test_extload_global_v2s96_from_24_align2 ; CI-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-HSA: [[LOAD:%[0-9]+]]:_(<2 x s96>) = G_LOAD [[COPY]](p1) :: (load 24, align 2, addrspace 1) @@ -11089,101 +12195,153 @@ body: | ; CI-MESA-LABEL: name: test_extload_global_v2s96_from_24_align2 ; CI-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CI-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; CI-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) 
:: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; CI-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; CI-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; CI-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CI-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; CI-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CI-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C4]](s64) ; CI-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; CI-MESA: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) + ; CI-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-MESA: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; CI-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; CI-MESA: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C7]](s64) ; CI-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-MESA: [[GEP6:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C]](s64) ; CI-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; CI-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C1]](s64) ; CI-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; CI-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C2]](s64) ; CI-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: 
(load 2, addrspace 1) - ; CI-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; CI-MESA: [[GEP9:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C3]](s64) ; CI-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-MESA: [[GEP10:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C4]](s64) ; CI-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p1) :: (load 2, addrspace 1) - ; CI-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; CI-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; CI-MESA: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; CI-MESA: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; CI-MESA: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; CI-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; CI-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; CI-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CI-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; CI-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; CI-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; CI-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; CI-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; CI-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; CI-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; CI-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; CI-MESA: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; CI-MESA: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; CI-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; CI-MESA: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; VI-LABEL: name: test_extload_global_v2s96_from_24_align2 ; VI: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; VI: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; VI: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD4]](s32) ; VI: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; VI: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C4]](s64) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; VI: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; VI: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; VI: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; VI: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C7]](s64) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C]](s64) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C1]](s64) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; VI: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C2]](s64) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[GEP9:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C3]](s64) ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[GEP10:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C4]](s64) ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p1) :: (load 2, addrspace 1) - ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; 
VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; VI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; VI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; VI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; VI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; VI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; GFX9-HSA-LABEL: name: test_extload_global_v2s96_from_24_align2 ; GFX9-HSA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-HSA: [[LOAD:%[0-9]+]]:_(<2 x s96>) = G_LOAD [[COPY]](p1) :: (load 24, align 2, addrspace 1) @@ -11194,52 +12352,78 @@ body: | ; GFX9-MESA-LABEL: name: test_extload_global_v2s96_from_24_align2 ; GFX9-MESA: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-MESA: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9-MESA: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9-MESA: [[GEP:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C]](s64) ; GFX9-MESA: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9-MESA: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA: [[GEP1:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C1]](s64) ; GFX9-MESA: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9-MESA: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 ; GFX9-MESA: [[GEP2:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C2]](s64) ; GFX9-MESA: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; GFX9-MESA: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; GFX9-MESA: [[GEP3:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C3]](s64) ; GFX9-MESA: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9-MESA: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; GFX9-MESA: [[GEP4:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C4]](s64) ; GFX9-MESA: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; GFX9-MESA: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9-MESA: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C5]](s64) + ; GFX9-MESA: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9-MESA: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9-MESA: [[AND1:%[0-9]+]]:_(s32) = 
G_AND [[COPY2]], [[C5]] + ; GFX9-MESA: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-MESA: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; GFX9-MESA: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9-MESA: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9-MESA: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; GFX9-MESA: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9-MESA: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; GFX9-MESA: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; GFX9-MESA: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9-MESA: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9-MESA: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; GFX9-MESA: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9-MESA: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; GFX9-MESA: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; GFX9-MESA: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9-MESA: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-MESA: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; GFX9-MESA: [[GEP5:%[0-9]+]]:_(p1) = G_GEP [[COPY]], [[C7]](s64) ; GFX9-MESA: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9-MESA: [[GEP6:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C]](s64) ; GFX9-MESA: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9-MESA: [[GEP7:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C1]](s64) ; GFX9-MESA: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; GFX9-MESA: [[GEP8:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C2]](s64) ; GFX9-MESA: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9-MESA: [[GEP9:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C3]](s64) ; GFX9-MESA: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9-MESA: [[GEP10:%[0-9]+]]:_(p1) = G_GEP [[GEP5]], [[C4]](s64) ; GFX9-MESA: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p1) :: (load 2, addrspace 1) - ; GFX9-MESA: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; GFX9-MESA: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; GFX9-MESA: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; GFX9-MESA: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX9-MESA: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9-MESA: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; GFX9-MESA: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9-MESA: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; GFX9-MESA: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; GFX9-MESA: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9-MESA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9-MESA: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; GFX9-MESA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9-MESA: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; GFX9-MESA: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; 
GFX9-MESA: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GFX9-MESA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9-MESA: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; GFX9-MESA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9-MESA: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; GFX9-MESA: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; GFX9-MESA: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; GFX9-MESA: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9-MESA: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; GFX9-MESA: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; GFX9-MESA: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; GFX9-MESA: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 2, addrspace 1) %2:_(s96) = G_EXTRACT %1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir index 69a568cce314ee..d5b869a560e913 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir @@ -385,53 +385,78 @@ body: | ; SI-LABEL: name: test_load_local_s32_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: $vgpr0 = COPY [[MV]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: $vgpr0 = COPY [[OR]](s32) ; CI-LABEL: name: test_load_local_s32_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: $vgpr0 = COPY [[OR]](s32) ; CI-DS128-LABEL: name: test_load_local_s32_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-DS128: $vgpr0 = COPY [[MV]](s32) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: $vgpr0 = COPY [[OR]](s32) ; VI-LABEL: name: test_load_local_s32_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: $vgpr0 = COPY [[OR]](s32) ; GFX9-LABEL: name: test_load_local_s32_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: $vgpr0 = COPY [[OR]](s32) %0:_(p3) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -473,8 +498,12 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: $vgpr0 = COPY [[MV]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT 
[[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: $vgpr0 = COPY [[OR2]](s32) ; CI-LABEL: name: test_load_local_s32_align1 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -505,8 +534,12 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: $vgpr0 = COPY [[OR2]](s32) ; CI-DS128-LABEL: name: test_load_local_s32_align1 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -537,8 +570,12 @@ body: | ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-DS128: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-DS128: $vgpr0 = COPY [[MV]](s32) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-DS128: $vgpr0 = COPY [[OR2]](s32) ; VI-LABEL: name: test_load_local_s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -565,8 +602,12 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: $vgpr0 = COPY [[OR2]](s32) ; GFX9-LABEL: name: test_load_local_s32_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -593,8 +634,12 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: $vgpr0 = COPY [[OR2]](s32) %0:_(p3) = COPY $vgpr0 
%1:_(s32) = G_LOAD %0 :: (load 4, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -708,92 +753,142 @@ body: | ; SI-LABEL: name: test_load_local_s64_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; SI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-LABEL: name: test_load_local_s64_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR 
[[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-DS128-LABEL: name: test_load_local_s64_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI-DS128: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI-DS128: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-DS128: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-DS128: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-DS128: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-DS128: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI-DS128: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_local_s64_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; VI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-LABEL: name: test_load_local_s64_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 8, align 2, addrspace 3) @@ -864,7 +959,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; 
SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-LABEL: name: test_load_local_s64_align1 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -924,7 +1028,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-DS128-LABEL: name: test_load_local_s64_align1 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -984,7 +1097,16 @@ body: | ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-DS128: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-DS128: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-DS128: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_local_s64_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -1036,7 +1158,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: 
[[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-LABEL: name: test_load_local_s64_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -1088,7 +1219,16 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p3) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 8, align 1, addrspace 3) @@ -1160,7 +1300,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -1174,21 +1323,24 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: 
[[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; SI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; SI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; CI-LABEL: name: test_load_local_s96_align16 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -1249,7 +1401,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -1263,21 +1424,24 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; CI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; CI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 
64 ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; CI-DS128-LABEL: name: test_load_local_s96_align16 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -1365,7 +1529,20 @@ body: | ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI-DS128: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; CI-DS128: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; CI-DS128: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI-DS128: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-DS128: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; CI-DS128: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_local_s96_align16 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -1417,9 +1594,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -1431,18 +1617,21 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; GFX9-LABEL: name: test_load_local_s96_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -1494,9 +1683,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -1508,18 +1706,21 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: 
[[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) %0:_(p3) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 3) @@ -1645,144 +1846,210 @@ body: | ; SI-LABEL: name: test_load_local_s96_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; SI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] ; SI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; SI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], 
[[MV1]](s32), 64 + ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR2]](s32), 64 ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; CI-LABEL: name: test_load_local_s96_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] ; CI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; CI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR2]](s32), 64 ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; CI-DS128-LABEL: name: test_load_local_s96_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI-DS128: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI-DS128: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; CI-DS128: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-DS128: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; CI-DS128: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI-DS128: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) + ; CI-DS128: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-DS128: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-DS128: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-DS128: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-DS128: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-DS128: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; CI-DS128: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-DS128: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_local_s96_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: 
(load 2, addrspace 3) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; VI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR2]](s32), 64 ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; GFX9-LABEL: name: test_load_local_s96_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR2]](s32), 64 ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) %0:_(p3) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 2, addrspace 3) @@ -1854,7 +2121,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -1868,21 +2144,24 @@ body: | ; 
SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; SI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; SI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; CI-LABEL: name: test_load_local_s96_align1 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -1943,7 +2222,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -1957,21 +2245,24 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: 
[[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; CI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; CI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; CI-DS128-LABEL: name: test_load_local_s96_align1 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -2059,7 +2350,20 @@ body: | ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI-DS128: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; CI-DS128: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; CI-DS128: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI-DS128: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-DS128: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; CI-DS128: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_local_s96_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -2111,9 +2415,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: 
[[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -2125,18 +2438,21 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) ; GFX9-LABEL: name: test_load_local_s96_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -2188,9 +2504,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, 
addrspace 3) @@ -2202,18 +2527,21 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 + ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96) %0:_(p3) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 3) @@ -2285,7 +2613,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -2307,34 +2644,42 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; 
SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; SI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; CI-LABEL: name: test_load_local_s128_align16 @@ -2396,7 +2741,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = 
G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -2418,34 +2772,42 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; CI-DS128-LABEL: name: test_load_local_s128_align16 @@ -2562,7 +2924,24 @@ body: | ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], 
[[COPY15]](s32) ; CI-DS128: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) ; CI-DS128: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; CI-DS128: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; CI-DS128: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; CI-DS128: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-DS128: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; CI-DS128: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-DS128: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-DS128: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-DS128: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; CI-DS128: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_local_s128_align16 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -2614,9 +2993,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -2636,27 +3024,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: 
[[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; GFX9-LABEL: name: test_load_local_s128_align16 @@ -2709,9 +3105,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -2731,27 +3136,35 @@ body: | ; GFX9: 
[[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) %0:_(p3) = COPY $vgpr0 @@ -2862,167 +3275,257 @@ body: | ; SI-LABEL: name: test_load_local_s128_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; SI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], 
[[C2]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; SI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C2]](s32) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; SI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; CI-LABEL: name: test_load_local_s128_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; CI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C2]](s32) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; CI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; CI-DS128-LABEL: name: test_load_local_s128_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI-DS128: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI-DS128: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; CI-DS128: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-DS128: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; CI-DS128: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI-DS128: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; CI-DS128: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 ; CI-DS128: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; CI-DS128: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 ; CI-DS128: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; CI-DS128: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; CI-DS128: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C7]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C7]] + ; CI-DS128: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C8]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-DS128: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C7]] + ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-DS128: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C7]] + ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C8]](s32) + ; CI-DS128: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-DS128: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-DS128: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C7]] + ; CI-DS128: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-DS128: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C7]] + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C8]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-DS128: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-DS128: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C7]] + ; CI-DS128: 
[[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-DS128: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C7]] + ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) + ; CI-DS128: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_local_s128_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; VI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C2]](s32) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) 
+ ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; VI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; GFX9-LABEL: name: test_load_local_s128_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C2]](s32) ; GFX9: 
[[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32) ; GFX9: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) %0:_(p3) = COPY $vgpr0 @@ -3095,7 +3598,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -3117,34 +3629,42 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) 
; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; SI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; CI-LABEL: name: test_load_local_s128_align1 @@ -3206,7 +3726,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -3228,34 +3757,42 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: 
[[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; CI-DS128-LABEL: name: test_load_local_s128_align1 @@ -3372,7 +3909,24 @@ body: | ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) ; CI-DS128: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) ; CI-DS128: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: 
[[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32) + ; CI-DS128: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32) + ; CI-DS128: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]] + ; CI-DS128: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-DS128: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32) + ; CI-DS128: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-DS128: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-DS128: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-DS128: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32) + ; CI-DS128: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32) ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_local_s128_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -3424,9 +3978,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -3446,27 +4009,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) ; GFX9-LABEL: name: test_load_local_s128_align1 @@ -3519,9 +4090,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -3541,27 +4121,35 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], 
[[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128) %0:_(p3) = COPY $vgpr0 @@ -3640,92 +4228,142 @@ body: | ; SI-LABEL: name: test_load_local_p1_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; SI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] 
+ ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-LABEL: name: test_load_local_p1_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-DS128-LABEL: name: test_load_local_p1_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI-DS128: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; 
CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CI-DS128: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI-DS128: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-DS128: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-DS128: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI-DS128: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-DS128: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CI-DS128: $vgpr0_vgpr1 = COPY [[MV]](p1) ; VI-LABEL: name: test_load_local_p1_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; VI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; GFX9-LABEL: name: test_load_local_p1_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p3) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load 8, align 2, addrspace 3) @@ -3796,7 +4434,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-LABEL: name: test_load_local_p1_align1 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -3856,7 +4503,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES 
[[OR4]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-DS128-LABEL: name: test_load_local_p1_align1 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -3916,7 +4572,16 @@ body: | ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-DS128: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-DS128: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-DS128: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-DS128: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-DS128: $vgpr0_vgpr1 = COPY [[MV]](p1) ; VI-LABEL: name: test_load_local_p1_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -3968,7 +4633,16 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; GFX9-LABEL: name: test_load_local_p1_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -4020,7 +4694,16 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p3) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load 8, align 1, addrspace 3) @@ -4067,53 +4750,83 @@ body: | ; SI-LABEL: name: test_load_local_p3_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p3) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p3) ; CI-LABEL: name: test_load_local_p3_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p3) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p3) ; CI-DS128-LABEL: name: test_load_local_p3_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-DS128: $vgpr0 = COPY [[MV]](p3) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; CI-DS128: $vgpr0 = COPY [[INTTOPTR]](p3) ; VI-LABEL: name: test_load_local_p3_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP 
[[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p3) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p3) ; GFX9-LABEL: name: test_load_local_p3_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p3) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p3) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load 4, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -4155,8 +4868,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p3) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p3) ; CI-LABEL: name: test_load_local_p3_align1 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -4187,8 +4905,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p3) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) 
+ ; CI: $vgpr0 = COPY [[INTTOPTR]](p3) ; CI-DS128-LABEL: name: test_load_local_p3_align1 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -4219,8 +4942,13 @@ body: | ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-DS128: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-DS128: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-DS128: $vgpr0 = COPY [[MV]](p3) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-DS128: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; CI-DS128: $vgpr0 = COPY [[INTTOPTR]](p3) ; VI-LABEL: name: test_load_local_p3_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -4247,8 +4975,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p3) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p3) ; GFX9-LABEL: name: test_load_local_p3_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -4275,8 +5008,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p3) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p3) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load 4, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -4322,53 +5060,83 @@ body: | ; SI-LABEL: name: test_load_local_p5_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p5) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND 
[[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-LABEL: name: test_load_local_p5_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p5) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-DS128-LABEL: name: test_load_local_p5_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-DS128: $vgpr0 = COPY [[MV]](p5) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; CI-DS128: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_local_p5_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: 
[[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-LABEL: name: test_load_local_p5_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p5) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p3) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load 4, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -4410,8 +5178,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p5) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-LABEL: name: test_load_local_p5_align1 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -4442,8 +5215,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p5) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-DS128-LABEL: name: test_load_local_p5_align1 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -4474,8 +5252,13 @@ body: | ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-DS128: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-DS128: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES 
[[OR]](s16), [[OR1]](s16) - ; CI-DS128: $vgpr0 = COPY [[MV]](p5) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-DS128: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; CI-DS128: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_local_p5_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -4502,8 +5285,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-LABEL: name: test_load_local_p5_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3) @@ -4530,8 +5318,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p5) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p3) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load 4, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -6446,9 +7239,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; SI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -6461,19 +7258,22 @@ body: | ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) 
- ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-LABEL: name: test_load_local_v2s32_align1 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -6505,9 +7305,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; CI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -6520,19 +7324,22 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], 
[[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-DS128-LABEL: name: test_load_local_v2s32_align1 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -6564,9 +7371,13 @@ body: | ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-DS128: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-DS128: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI-DS128: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI-DS128: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -6579,19 +7390,22 @@ body: | ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-DS128: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-DS128: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-DS128: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-DS128: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-DS128: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-DS128: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-DS128: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-DS128: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: 
[[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-DS128: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI-DS128: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_load_local_v2s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -6619,9 +7433,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; VI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -6633,16 +7451,19 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_load_local_v2s32_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -6670,9 +7491,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = 
G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -6684,16 +7509,19 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 8, align 1, addrspace 3) @@ -6737,9 +7565,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; SI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -6752,18 +7584,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: 
[[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -6777,19 +7612,22 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; CI-LABEL: name: test_load_local_v3s32_align16 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -6822,9 +7660,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], 
[[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; CI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -6837,18 +7679,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -6862,19 +7707,22 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: 
[[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; CI-DS128-LABEL: name: test_load_local_v3s32_align16 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -6907,9 +7755,13 @@ body: | ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-DS128: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-DS128: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI-DS128: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI-DS128: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -6922,18 +7774,21 @@ body: | ; CI-DS128: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-DS128: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-DS128: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-DS128: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-DS128: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-DS128: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-DS128: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-DS128: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-DS128: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-DS128: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI-DS128: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI-DS128: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -6947,19 +7802,22 @@ body: | ; CI-DS128: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-DS128: 
[[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI-DS128: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-DS128: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI-DS128: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-DS128: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-DS128: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-DS128: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI-DS128: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-DS128: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-DS128: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-DS128: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32) + ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-DS128: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-DS128: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-DS128: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-DS128: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI-DS128: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_load_local_v3s32_align16 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -6987,9 +7845,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; VI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -7001,17 +7863,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: 
[[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -7023,16 +7888,19 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_load_local_v3s32_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -7060,9 +7928,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: 
(load 1, addrspace 3) @@ -7074,17 +7946,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -7096,16 +7971,19 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load 12, align 1, addrspace 3) @@ -7321,181 +8199,251 @@ body: | ; SI-LABEL: name: test_load_local_v4s32_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; 
SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) - ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; SI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C3]](s32) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; SI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR 
[[MV2]](s32), [[MV3]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; SI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) ; CI-LABEL: name: test_load_local_v4s32_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) - ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: 
[[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C3]](s32) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; CI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV2]](s32), [[MV3]](s32) + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) ; CI-DS128-LABEL: name: test_load_local_v4s32_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-DS128: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-DS128: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) - ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) + ; CI-DS128: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-DS128: 
[[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-DS128: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI-DS128: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; CI-DS128: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI-DS128: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-DS128: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI-DS128: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI-DS128: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI-DS128: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) + ; CI-DS128: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-DS128: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CI-DS128: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-DS128: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-DS128: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C3]](s32) ; CI-DS128: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-DS128: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; CI-DS128: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI-DS128: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; CI-DS128: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV2]](s32), [[MV3]](s32) + ; CI-DS128: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-DS128: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CI-DS128: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-DS128: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; CI-DS128: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI-DS128: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) ; CI-DS128: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) ; VI-LABEL: name: test_load_local_v4s32_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) - ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; VI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C3]](s32) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV2]](s32), [[MV3]](s32) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) ; 
GFX9-LABEL: name: test_load_local_v4s32_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; GFX9: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C1]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C3]](s32) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; GFX9: 
[[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV2]](s32), [[MV3]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR3]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) %0:_(p3) = COPY $vgpr0 @@ -7540,9 +8488,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; SI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -7555,19 +8507,22 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], 
[[SHL5]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -7581,19 +8536,22 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C6]](s32) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C7]](s32) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; SI: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -7606,19 +8564,22 @@ body: | ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; SI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV2]](s32), [[MV3]](s32) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + 
; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR8]](s32), [[OR11]](s32) ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) ; CI-LABEL: name: test_load_local_v4s32_align1 @@ -7652,9 +8613,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; CI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -7667,19 +8632,22 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -7693,19 +8661,22 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY 
[[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C6]](s32) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C7]](s32) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; CI: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -7718,19 +8689,22 @@ body: | ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; CI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV2]](s32), [[MV3]](s32) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR8]](s32), [[OR11]](s32) ; CI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x 
s32>), [[BUILD_VECTOR1]](<2 x s32>) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) ; CI-DS128-LABEL: name: test_load_local_v4s32_align1 @@ -7764,9 +8738,13 @@ body: | ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-DS128: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-DS128: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI-DS128: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI-DS128: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -7779,18 +8757,21 @@ body: | ; CI-DS128: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-DS128: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-DS128: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-DS128: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-DS128: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-DS128: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-DS128: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-DS128: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-DS128: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-DS128: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI-DS128: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI-DS128: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -7804,20 +8785,23 @@ body: | ; CI-DS128: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-DS128: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI-DS128: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-DS128: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) 
= G_SHL [[AND9]], [[COPY9]](s32) + ; CI-DS128: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-DS128: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-DS128: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-DS128: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI-DS128: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-DS128: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-DS128: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-DS128: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI-DS128: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI-DS128: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) + ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-DS128: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-DS128: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-DS128: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-DS128: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI-DS128: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-DS128: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI-DS128: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; CI-DS128: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; CI-DS128: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -7830,19 +8814,22 @@ body: | ; CI-DS128: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-DS128: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI-DS128: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-DS128: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-DS128: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI-DS128: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-DS128: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-DS128: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-DS128: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; CI-DS128: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-DS128: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI-DS128: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-DS128: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-DS128: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; CI-DS128: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI-DS128: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI-DS128: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-DS128: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-DS128: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI-DS128: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; CI-DS128: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-DS128: 
[[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; VI-LABEL: name: test_load_local_v4s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -7870,9 +8857,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; VI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -7884,18 +8875,21 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -7907,16 +8901,19 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: 
[[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C5]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C6]](s32) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; VI: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -7928,16 +8925,19 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV2]](s32), [[MV3]](s32) + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR8]](s32), [[OR11]](s32) ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) ; GFX9-LABEL: name: test_load_local_v4s32_align1 @@ -7966,9 +8966,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: 
[[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -7980,18 +8984,21 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -8003,16 +9010,19 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C5]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C6]](s32) ; 
GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -8024,16 +9034,19 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV2]](s32), [[MV3]](s32) + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR8]](s32), [[OR11]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>) %0:_(p3) = COPY $vgpr0 @@ -8411,7 +9424,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -8433,34 +9455,42 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], 
[[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-LABEL: name: test_load_local_v2s64_align16 @@ -8522,7 +9552,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: 
[[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -8544,34 +9583,42 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), 
[[OR11]](s32) ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-DS128-LABEL: name: test_load_local_v2s64_align16 @@ -8633,7 +9680,16 @@ body: | ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI-DS128: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI-DS128: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI-DS128: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; CI-DS128: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -8655,34 +9711,42 @@ body: | ; CI-DS128: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-DS128: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI-DS128: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI-DS128: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI-DS128: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI-DS128: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI-DS128: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI-DS128: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-DS128: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI-DS128: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-DS128: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI-DS128: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI-DS128: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI-DS128: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] ; CI-DS128: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; CI-DS128: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; CI-DS128: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-DS128: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-DS128: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI-DS128: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-DS128: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-DS128: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI-DS128: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; CI-DS128: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-DS128: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-DS128: 
[[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI-DS128: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI-DS128: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-DS128: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI-DS128: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-DS128: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI-DS128: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; CI-DS128: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI-DS128: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-DS128: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-DS128: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI-DS128: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI-DS128: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI-DS128: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; CI-DS128: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CI-DS128: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-DS128: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI-DS128: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI-DS128: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_local_v2s64_align16 @@ -8735,9 +9799,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -8757,27 +9830,35 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: test_load_local_v2s64_align16 @@ -8830,9 +9911,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -8852,27 +9942,35 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: 
[[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32) ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16), [[OR6]](s16), [[OR7]](s16) + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32) ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p3) = COPY $vgpr0 @@ -9396,9 +10494,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; SI: 
[[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -9411,19 +10513,22 @@ body: | ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-LABEL: name: test_extload_local_v2s32_from_4_align1 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -9455,9 +10560,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; CI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -9470,19 +10579,22 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-DS128-LABEL: name: test_extload_local_v2s32_from_4_align1 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -9514,9 +10626,13 @@ body: | ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI-DS128: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) + ; CI-DS128: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI-DS128: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI-DS128: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI-DS128: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -9529,19 +10645,22 @@ body: | ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI-DS128: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI-DS128: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI-DS128: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-DS128: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI-DS128: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI-DS128: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI-DS128: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI-DS128: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI-DS128: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI-DS128: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI-DS128: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI-DS128: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI-DS128: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_extload_local_v2s32_from_4_align1 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -9569,9 +10688,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; VI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -9583,16 +10706,19 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_extload_local_v2s32_from_4_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 @@ -9620,9 +10746,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: 
[[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 1, addrspace 3) @@ -9634,16 +10764,19 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 1, addrspace 3) @@ -9659,97 +10792,137 @@ body: | ; SI-LABEL: name: test_extload_local_v2s32_from_4_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-LABEL: name: test_extload_local_v2s32_from_4_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-DS128-LABEL: name: test_extload_local_v2s32_from_4_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-DS128: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI-DS128: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI-DS128: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-DS128: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-DS128: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI-DS128: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-DS128: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; CI-DS128: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_extload_local_v2s32_from_4_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], 
[[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_extload_local_v2s32_from_4_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 2, addrspace 3) @@ -10019,7 +11192,16 @@ body: | ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: 
[[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -10033,23 +11215,26 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; SI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; SI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 - ; SI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) + ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; SI: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C11]](s32) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; SI: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -10070,34 +11255,42 @@ body: | ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: 
[[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] ; SI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; SI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; SI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C9]] - ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) - ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) + ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; SI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; SI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; SI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C9]] - ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) - ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] - ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16) + ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) + ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; SI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; SI: [[OR13:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL13]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; SI: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL14]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR13]](s32), [[OR14]](s32) ; SI: [[GEP19:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C8]](s32) ; SI: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[GEP19]](p3) :: (load 1, addrspace 3) ; SI: [[GEP20:%[0-9]+]]:_(p3) = G_GEP [[GEP19]], [[C]](s32) @@ -10111,21 +11304,24 @@ body: | ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; SI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C9]] - ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) - ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) + ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; SI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; SI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; SI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; SI: [[COPY23:%[0-9]+]]:_(s32) = 
COPY [[C8]](s32) ; SI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; SI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C9]] - ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) - ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; SI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR10]](s16), [[OR11]](s16) + ; SI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) + ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL16]](s32) + ; SI: [[OR16:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR16]](s16) + ; SI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; SI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] ; SI: [[DEF1:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF - ; SI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV2]](s64), 0 - ; SI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[MV3]](s32), 64 + ; SI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV1]](s64), 0 + ; SI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[OR17]](s32), 64 ; SI: [[COPY25:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) ; SI: [[COPY26:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY25]](s96) @@ -10189,7 +11385,16 @@ body: | ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) @@ -10203,23 +11408,26 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C9]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C9]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; CI: 
[[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C10]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; CI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; CI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 - ; CI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) + ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; CI: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C11]](s32) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; CI: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -10240,34 +11448,42 @@ body: | ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C9]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C9]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] ; CI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; CI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; CI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C9]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) - ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) + ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; CI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C9]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) - ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR6]](s16), 
[[OR7]](s16), [[OR8]](s16), [[OR9]](s16) + ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) + ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C10]](s32) + ; CI: [[OR13:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL13]] + ; CI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; CI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C10]](s32) + ; CI: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL14]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR13]](s32), [[OR14]](s32) ; CI: [[GEP19:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C8]](s32) ; CI: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[GEP19]](p3) :: (load 1, addrspace 3) ; CI: [[GEP20:%[0-9]+]]:_(p3) = G_GEP [[GEP19]], [[C]](s32) @@ -10281,21 +11497,24 @@ body: | ; CI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C9]] - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) - ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) + ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; CI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C8]](s32) ; CI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C9]] - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) - ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR10]](s16), [[OR11]](s16) + ; CI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) + ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL16]](s32) + ; CI: [[OR16:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR16]](s16) + ; CI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C10]](s32) + ; CI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] ; CI: [[DEF1:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF - ; CI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV2]](s64), 0 - ; CI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[MV3]](s32), 64 + ; CI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV1]](s64), 0 + ; CI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[OR17]](s32), 64 ; CI: [[COPY25:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) ; CI: [[COPY26:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY25]](s96) @@ -10386,9 +11605,22 @@ body: | ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CI-DS128: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) - ; CI-DS128: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI-DS128: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C13]](s32) + ; CI-DS128: 
[[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI-DS128: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI-DS128: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C13]](s32) + ; CI-DS128: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CI-DS128: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CI-DS128: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C13]](s32) + ; CI-DS128: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CI-DS128: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI-DS128: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CI-DS128: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C13]](s32) + ; CI-DS128: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) + ; CI-DS128: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI-DS128: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C14]](s32) ; CI-DS128: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; CI-DS128: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; CI-DS128: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -10417,50 +11649,62 @@ body: | ; CI-DS128: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C7]](s32) ; CI-DS128: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI-DS128: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C12]] - ; CI-DS128: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI-DS128: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI-DS128: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI-DS128: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI-DS128: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI-DS128: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI-DS128: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI-DS128: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C11]] ; CI-DS128: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C7]](s32) ; CI-DS128: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI-DS128: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C12]] - ; CI-DS128: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI-DS128: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI-DS128: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI-DS128: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI-DS128: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI-DS128: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] ; CI-DS128: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; CI-DS128: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C11]] ; CI-DS128: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C7]](s32) ; CI-DS128: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI-DS128: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C12]] - ; CI-DS128: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) - ; CI-DS128: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI-DS128: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI-DS128: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) + ; CI-DS128: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) + ; CI-DS128: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI-DS128: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI-DS128: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C11]] ; CI-DS128: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C7]](s32) ; CI-DS128: 
[[COPY20:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI-DS128: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C12]] - ; CI-DS128: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) - ; CI-DS128: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI-DS128: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI-DS128: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) + ; CI-DS128: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI-DS128: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] ; CI-DS128: [[TRUNC20:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD20]](s32) ; CI-DS128: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C11]] ; CI-DS128: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C7]](s32) ; CI-DS128: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI-DS128: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C12]] - ; CI-DS128: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) - ; CI-DS128: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI-DS128: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI-DS128: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) + ; CI-DS128: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI-DS128: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI-DS128: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI-DS128: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C11]] ; CI-DS128: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C7]](s32) ; CI-DS128: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI-DS128: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C12]] - ; CI-DS128: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) - ; CI-DS128: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI-DS128: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI-DS128: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16), [[OR10]](s16), [[OR11]](s16) + ; CI-DS128: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) + ; CI-DS128: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; CI-DS128: [[OR14:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI-DS128: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI-DS128: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI-DS128: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C13]](s32) + ; CI-DS128: [[OR15:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL15]] + ; CI-DS128: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; CI-DS128: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI-DS128: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C13]](s32) + ; CI-DS128: [[OR16:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL16]] + ; CI-DS128: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI-DS128: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR14]](s16) + ; CI-DS128: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C13]](s32) + ; CI-DS128: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI-DS128: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR15]](s32), [[OR16]](s32), [[OR17]](s32) ; CI-DS128: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV]](s96) ; CI-DS128: [[COPY26:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY25]](s96) @@ -10515,9 +11759,18 @@ body: | ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: 
[[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -10529,20 +11782,23 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 - ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) + ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; VI: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C11]](s32) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; VI: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -10562,28 +11818,36 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: 
[[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] ; VI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL11]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16) - ; VI: [[GEP19:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C9]](s32) + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL12]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; VI: [[OR13:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL13]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; VI: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL14]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR13]](s32), [[OR14]](s32) + ; VI: [[GEP19:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C10]](s32) ; VI: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[GEP19]](p3) :: (load 1, addrspace 3) ; VI: [[GEP20:%[0-9]+]]:_(p3) = G_GEP [[GEP19]], [[C]](s32) ; VI: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[GEP20]](p3) :: (load 1, addrspace 3) @@ -10595,18 +11859,21 @@ body: | ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL15]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR10]](s16), [[OR11]](s16) + ; VI: [[SHL16:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; VI: [[OR16:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR16]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] ; VI: [[DEF1:%[0-9]+]]:_(s96) = 
G_IMPLICIT_DEF - ; VI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV2]](s64), 0 - ; VI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[MV3]](s32), 64 + ; VI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV1]](s64), 0 + ; VI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[OR17]](s32), 64 ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -10661,9 +11928,18 @@ body: | ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]] ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16) ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C9]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 1, addrspace 3) @@ -10675,20 +11951,23 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 - ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C10]](s32) + ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR8]](s32), 64 + ; GFX9: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: 
[[GEP11:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C11]](s32) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP12:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C]](s32) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p3) :: (load 1, addrspace 3) @@ -10708,28 +11987,36 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] ; GFX9: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD16]](s32) ; GFX9: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C7]] ; GFX9: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C7]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C8]](s16) + ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL11]] ; GFX9: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C7]] ; GFX9: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C7]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16), [[OR8]](s16), [[OR9]](s16) - ; GFX9: [[GEP19:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C9]](s32) + ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C8]](s16) + ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL12]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32) + ; GFX9: [[OR13:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL13]] + ; GFX9: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR11]](s16) + ; GFX9: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C9]](s32) + ; GFX9: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL14]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR13]](s32), [[OR14]](s32) + ; GFX9: [[GEP19:%[0-9]+]]:_(p3) = G_GEP [[GEP11]], [[C10]](s32) ; GFX9: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[GEP19]](p3) :: (load 1, addrspace 3) ; GFX9: [[GEP20:%[0-9]+]]:_(p3) = G_GEP [[GEP19]], [[C]](s32) ; GFX9: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[GEP20]](p3) :: (load 1, addrspace 3) @@ -10741,18 +12028,21 @@ body: | ; GFX9: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C7]] ; GFX9: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C7]] - ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) - ; 
GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C8]](s16) + ; GFX9: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL15]] ; GFX9: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C7]] ; GFX9: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C7]] - ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) - ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR10]](s16), [[OR11]](s16) + ; GFX9: [[SHL16:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C8]](s16) + ; GFX9: [[OR16:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL16]] + ; GFX9: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR16]](s16) + ; GFX9: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C9]](s32) + ; GFX9: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] ; GFX9: [[DEF1:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF - ; GFX9: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV2]](s64), 0 - ; GFX9: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[MV3]](s32), 64 + ; GFX9: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV1]](s64), 0 + ; GFX9: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[OR17]](s32), 64 ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -10774,276 +12064,398 @@ body: | ; SI-LABEL: name: test_extload_local_v2s96_from_24_align2 ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; SI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: 
[[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] ; SI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; SI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 - ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) + ; SI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR2]](s32), 64 + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C1]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; SI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C2]](s32) ; SI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16) - ; SI: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C3]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32) + ; SI: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C5]](s32) ; SI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[GEP10:%[0-9]+]]:_(p3) = G_GEP [[GEP9]], [[C]](s32) ; SI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p3) :: (load 2, addrspace 3) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC10]](s16), 
[[TRUNC11]](s16) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C4]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] ; SI: [[DEF1:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF - ; SI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV2]](s64), 0 - ; SI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[MV3]](s32), 64 - ; SI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) - ; SI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) - ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; SI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV1]](s64), 0 + ; SI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[OR5]](s32), 64 + ; SI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) + ; SI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; CI-LABEL: name: test_extload_local_v2s96_from_24_align2 ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 
2, addrspace 3) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] ; CI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; CI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 - ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) + ; CI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR2]](s32), 64 + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C1]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; CI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C2]](s32) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16) - ; CI: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C3]](s32) + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32) + ; CI: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C5]](s32) ; CI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[GEP10:%[0-9]+]]:_(p3) = G_GEP [[GEP9]], [[C]](s32) ; CI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p3) :: (load 2, addrspace 3) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC10]](s16), [[TRUNC11]](s16) + ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C4]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] ; CI: [[DEF1:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF - ; CI: 
[[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV2]](s64), 0 - ; CI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[MV3]](s32), 64 - ; CI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) - ; CI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) - ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; CI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV1]](s64), 0 + ; CI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[OR5]](s32), 64 + ; CI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) + ; CI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) + ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; CI-DS128-LABEL: name: test_extload_local_v2s96_from_24_align2 ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; CI-DS128: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI-DS128: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI-DS128: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI-DS128: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; CI-DS128: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) ; CI-DS128: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CI-DS128: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) ; CI-DS128: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI-DS128: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; CI-DS128: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) ; CI-DS128: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI-DS128: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI-DS128: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) + ; CI-DS128: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI-DS128: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI-DS128: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CI-DS128: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI-DS128: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; CI-DS128: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI-DS128: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C6]](s32) + ; CI-DS128: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI-DS128: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI-DS128: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] + ; CI-DS128: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI-DS128: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; CI-DS128: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C6]](s32) + ; CI-DS128: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI-DS128: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI-DS128: [[AND4:%[0-9]+]]:_(s32) = 
G_AND [[COPY5]], [[C5]] + ; CI-DS128: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI-DS128: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; CI-DS128: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C6]](s32) + ; CI-DS128: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI-DS128: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; CI-DS128: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI-DS128: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C7]](s32) ; CI-DS128: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI-DS128: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; CI-DS128: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; CI-DS128: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C1]](s32) ; CI-DS128: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; CI-DS128: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C2]](s32) ; CI-DS128: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; CI-DS128: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C3]](s32) ; CI-DS128: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI-DS128: [[GEP10:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C4]](s32) ; CI-DS128: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p3) :: (load 2, addrspace 3) - ; CI-DS128: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; CI-DS128: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16) - ; CI-DS128: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) - ; CI-DS128: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) - ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; CI-DS128: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI-DS128: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; CI-DS128: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI-DS128: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; CI-DS128: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CI-DS128: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI-DS128: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI-DS128: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; CI-DS128: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI-DS128: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; CI-DS128: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C6]](s32) + ; CI-DS128: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; CI-DS128: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI-DS128: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; CI-DS128: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI-DS128: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] + ; CI-DS128: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C6]](s32) + ; CI-DS128: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; CI-DS128: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; CI-DS128: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; CI-DS128: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; CI-DS128: 
$vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; VI-LABEL: name: test_extload_local_v2s96_from_24_align2 ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; VI: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 - ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) + ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR2]](s32), 64 + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C1]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; VI: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C2]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16) - ; VI: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C3]](s32) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; VI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32) + ; VI: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C5]](s32) ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[GEP10:%[0-9]+]]:_(p3) = G_GEP [[GEP9]], [[C]](s32) ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p3) :: (load 2, addrspace 3) - ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC10]](s16), [[TRUNC11]](s16) + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; VI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; VI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C4]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] ; VI: [[DEF1:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF - ; VI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV2]](s64), 0 - ; VI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[MV3]](s32), 64 - ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) - ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; VI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV1]](s64), 0 + ; VI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[OR5]](s32), 64 + ; VI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) + ; VI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; GFX9-LABEL: name: test_extload_local_v2s96_from_24_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[GEP]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9: [[GEP1:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C1]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX9: [[GEP2:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C2]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C3]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C5]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p3) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[MV]](s64), 0 - ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[MV1]](s32), 64 - ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C4]](s32) + ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[OR2]](s32), 64 + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP5:%[0-9]+]]:_(p3) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C]](s32) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[GEP7:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C1]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = 
G_LOAD [[GEP7]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; GFX9: [[GEP8:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C2]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16) - ; GFX9: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C3]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C4]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32) + ; GFX9: [[GEP9:%[0-9]+]]:_(p3) = G_GEP [[GEP5]], [[C5]](s32) ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[GEP10:%[0-9]+]]:_(p3) = G_GEP [[GEP9]], [[C]](s32) ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p3) :: (load 2, addrspace 3) - ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC10]](s16), [[TRUNC11]](s16) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C4]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] ; GFX9: [[DEF1:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF - ; GFX9: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV2]](s64), 0 - ; GFX9: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[MV3]](s32), 64 - ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) - ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) - ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX9: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF1]], [[MV1]](s64), 0 + ; GFX9: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[OR5]](s32), 64 + ; GFX9: [[COPY13:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96) + ; GFX9: [[COPY14:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) %0:_(p3) = COPY $vgpr0 %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 2, addrspace 3) %2:_(s96) = G_EXTRACT %1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index d996cfac4bedf1..ed819e4658b265 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -328,43 +328,63 @@ body: | ; SI-LABEL: name: test_load_private_s32_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: 
(load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: $vgpr0 = COPY [[MV]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: $vgpr0 = COPY [[OR]](s32) ; CI-LABEL: name: test_load_private_s32_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: $vgpr0 = COPY [[OR]](s32) ; VI-LABEL: name: test_load_private_s32_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: $vgpr0 = COPY [[OR]](s32) ; GFX9-LABEL: name: test_load_private_s32_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), 
[[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, align 2, addrspace 5) $vgpr0 = COPY %1 @@ -406,8 +426,12 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: $vgpr0 = COPY [[MV]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: $vgpr0 = COPY [[OR2]](s32) ; CI-LABEL: name: test_load_private_s32_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -438,8 +462,12 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: $vgpr0 = COPY [[OR2]](s32) ; VI-LABEL: name: test_load_private_s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -466,8 +494,12 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: $vgpr0 = COPY [[OR2]](s32) ; GFX9-LABEL: name: test_load_private_s32_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -494,8 +526,12 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], 
[[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: $vgpr0 = COPY [[OR2]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, align 1, addrspace 5) $vgpr0 = COPY %1 @@ -664,79 +700,111 @@ body: | ; SI-LABEL: name: test_load_private_s64_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; SI: $vgpr0_vgpr1 = COPY [[MV2]](s64) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; SI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-LABEL: name: test_load_private_s64_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; CI: $vgpr0_vgpr1 = COPY [[MV2]](s64) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_private_s64_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; VI: $vgpr0_vgpr1 = COPY [[MV2]](s64) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-LABEL: name: test_load_private_s64_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: 
[[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; GFX9: $vgpr0_vgpr1 = COPY [[MV2]](s64) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 8, align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -778,9 +846,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -793,20 +865,23 @@ body: | ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC 
[[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; SI: $vgpr0_vgpr1 = COPY [[MV2]](s64) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; SI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; CI-LABEL: name: test_load_private_s64_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -837,9 +912,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -852,20 +931,23 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), 
[[MV1]](s32) - ; CI: $vgpr0_vgpr1 = COPY [[MV2]](s64) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; VI-LABEL: name: test_load_private_s64_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -892,9 +974,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -906,17 +992,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; VI: $vgpr0_vgpr1 = COPY [[MV2]](s64) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; VI: $vgpr0_vgpr1 = COPY [[MV]](s64) ; GFX9-LABEL: name: test_load_private_s64_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -943,9 +1032,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] 
- ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -957,17 +1050,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; GFX9: $vgpr0_vgpr1 = COPY [[MV2]](s64) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load 8, align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -1010,9 +1106,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -1025,18 +1125,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: 
[[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -1050,20 +1153,23 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-LABEL: name: test_load_private_s96_align16 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 56) @@ -1095,9 +1201,13 @@ body: | ; CI: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -1110,18 +1220,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -1135,20 +1248,23 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], 
[[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_private_s96_align16 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 56) @@ -1175,9 +1291,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -1189,17 +1309,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, 
addrspace 56) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 56) @@ -1211,17 +1334,20 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-LABEL: name: test_load_private_s96_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 56) @@ -1248,9 +1374,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -1262,17 +1392,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: 
[[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 56) @@ -1284,17 +1417,20 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 56) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -1419,111 +1555,155 @@ body: | ; SI-LABEL: name: test_load_private_s96_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY 
[[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; SI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-LABEL: name: test_load_private_s96_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], 
[[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; VI-LABEL: name: test_load_private_s96_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - 
; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; VI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-LABEL: name: test_load_private_s96_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND 
[[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; GFX9: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 2, addrspace 5) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -1566,9 +1746,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -1581,18 +1765,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: 
[[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -1606,20 +1793,23 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; CI-LABEL: name: test_load_private_s96_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -1651,9 +1841,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: 
[[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -1666,18 +1860,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -1691,20 +1888,23 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; 
VI-LABEL: name: test_load_private_s96_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -1731,9 +1931,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -1745,17 +1949,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -1767,17 +1974,20 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: 
[[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) ; GFX9-LABEL: name: test_load_private_s96_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -1804,9 +2014,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -1818,17 +2032,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], 
[[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -1840,17 +2057,20 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV3]](s96) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load 12, align 1, addrspace 5) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -1893,9 +2113,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -1908,18 +2132,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND 
[[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -1933,20 +2160,23 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 56) ; SI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 56) @@ -1959,20 +2189,23 @@ body: | ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; SI: 
[[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; SI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; CI-LABEL: name: test_load_private_s128_align16 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 56) @@ -2004,9 +2237,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -2019,18 +2256,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: 
[[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -2044,20 +2284,23 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 56) ; CI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 56) @@ -2070,20 +2313,23 @@ body: | ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), 
[[OR7]](s16) - ; CI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_private_s128_align16 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 56) @@ -2110,9 +2356,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -2124,17 +2374,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, 
addrspace 56) @@ -2146,17 +2399,20 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 56) ; VI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 56) @@ -2168,17 +2424,20 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; VI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; GFX9-LABEL: name: test_load_private_s128_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 56) @@ -2205,9 +2464,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL 
[[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -2219,17 +2482,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 56) @@ -2241,17 +2507,20 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; 
GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 56) @@ -2263,17 +2532,20 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 56) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -2422,143 +2694,199 @@ body: | ; SI-LABEL: name: test_load_private_s128_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], 
[[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; SI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; SI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; CI-LABEL: name: test_load_private_s128_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; CI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY 
[[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_private_s128_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD 
[[GEP5]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; VI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; GFX9-LABEL: name: test_load_private_s128_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, 
addrspace 5) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; GFX9: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 16, align 2, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -2601,9 +2929,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -2616,18 +2948,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -2641,20 +2976,23 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; SI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -2667,20 +3005,23 @@ body: | ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC 
[[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; SI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; CI-LABEL: name: test_load_private_s128_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -2712,9 +3053,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -2727,18 +3072,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: 
[[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -2752,20 +3100,23 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; CI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -2778,20 +3129,23 @@ body: | ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], 
[[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; CI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; VI-LABEL: name: test_load_private_s128_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -2818,9 +3172,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -2832,17 +3190,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: 
[[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -2854,17 +3215,20 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; VI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -2876,17 +3240,20 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; VI: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; GFX9-LABEL: name: test_load_private_s128_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -2913,9 +3280,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -2927,17 +3298,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -2949,17 +3323,20 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], 
[[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -2971,17 +3348,20 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[MV4:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV4]](s128) + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -3082,79 +3462,111 @@ body: | ; SI-LABEL: name: test_load_private_p1_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; SI: $vgpr0_vgpr1 = COPY [[MV2]](p1) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; SI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-LABEL: name: test_load_private_p1_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; CI: $vgpr0_vgpr1 = COPY [[MV2]](p1) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; 
VI-LABEL: name: test_load_private_p1_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; VI: $vgpr0_vgpr1 = COPY [[MV2]](p1) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; VI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; GFX9-LABEL: name: test_load_private_p1_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; GFX9: $vgpr0_vgpr1 = COPY [[MV2]](p1) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load 8, align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -3196,9 +3608,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -3211,20 +3627,23 @@ body: | ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; SI: $vgpr0_vgpr1 = COPY [[MV2]](p1) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: 
[[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; SI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; CI-LABEL: name: test_load_private_p1_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3255,9 +3674,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -3270,20 +3693,23 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; CI: $vgpr0_vgpr1 = COPY [[MV2]](p1) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; VI-LABEL: name: test_load_private_p1_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3310,9 +3736,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: 
[[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -3324,17 +3754,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; VI: $vgpr0_vgpr1 = COPY [[MV2]](p1) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; VI: $vgpr0_vgpr1 = COPY [[MV]](p1) ; GFX9-LABEL: name: test_load_private_p1_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3361,9 +3794,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -3375,17 +3812,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], 
[[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; GFX9: $vgpr0_vgpr1 = COPY [[MV2]](p1) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load 8, align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -3427,43 +3867,67 @@ body: | ; SI-LABEL: name: test_load_private_p3_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p3) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p3) ; CI-LABEL: name: test_load_private_p3_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p3) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; CI: 
$vgpr0 = COPY [[INTTOPTR]](p3) ; VI-LABEL: name: test_load_private_p3_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p3) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p3) ; GFX9-LABEL: name: test_load_private_p3_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p3) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load 4, align 2, addrspace 5) $vgpr0 = COPY %1 @@ -3505,8 +3969,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p3) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p3) ; CI-LABEL: name: test_load_private_p3_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3537,8 +4006,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY 
[[MV]](p3) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p3) ; VI-LABEL: name: test_load_private_p3_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3565,8 +4039,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p3) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p3) ; GFX9-LABEL: name: test_load_private_p3_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3593,8 +4072,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p3) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load 4, align 1, addrspace 5) $vgpr0 = COPY %1 @@ -3636,43 +4120,67 @@ body: | ; SI-LABEL: name: test_load_private_p5_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p5) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-LABEL: name: test_load_private_p5_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p5) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_private_p5_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-LABEL: name: test_load_private_p5_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p5) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load 4, align 2, addrspace 5) $vgpr0 = COPY %1 @@ -3714,8 +4222,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES 
[[OR]](s16), [[OR1]](s16) - ; SI: $vgpr0 = COPY [[MV]](p5) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; SI: $vgpr0 = COPY [[INTTOPTR]](p5) ; CI-LABEL: name: test_load_private_p5_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3746,8 +4259,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: $vgpr0 = COPY [[MV]](p5) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; CI: $vgpr0 = COPY [[INTTOPTR]](p5) ; VI-LABEL: name: test_load_private_p5_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3774,8 +4292,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: $vgpr0 = COPY [[MV]](p5) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; VI: $vgpr0 = COPY [[INTTOPTR]](p5) ; GFX9-LABEL: name: test_load_private_p5_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 1, addrspace 5) @@ -3802,8 +4325,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(p5) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: $vgpr0 = COPY [[MV]](p5) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; GFX9: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load 4, align 1, addrspace 5) $vgpr0 = COPY %1 @@ -5489,9 +6017,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT 
[[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -5504,19 +6036,22 @@ body: | ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-LABEL: name: test_load_private_v2s32_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5548,9 +6083,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -5563,19 +6102,22 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND 
[[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_load_private_v2s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5603,9 +6145,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -5617,16 +6163,19 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = 
G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_load_private_v2s32_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5654,9 +6203,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -5668,16 +6221,19 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 8, align 1, addrspace 5) @@ -5721,9 +6277,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 
4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -5736,18 +6296,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -5761,19 +6324,22 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), 
[[MV2]](s32) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; CI-LABEL: name: test_load_private_v3s32_align16 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5806,9 +6372,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -5821,18 +6391,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -5846,19 +6419,22 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND 
[[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; VI-LABEL: name: test_load_private_v3s32_align16 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5886,9 +6462,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -5900,17 +6480,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), 
[[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 56) @@ -5922,16 +6505,19 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; GFX9-LABEL: name: test_load_private_v3s32_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -5959,9 +6545,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -5973,17 +6563,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 56) @@ -5995,16 +6588,19 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load 12, align 1, addrspace 56) @@ -6103,9 +6699,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = 
G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -6118,18 +6718,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -6143,20 +6746,23 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; SI: 
[[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 56) ; SI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 56) @@ -6169,19 +6775,22 @@ body: | ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; CI-LABEL: name: test_load_private_v4s32_align16 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -6214,9 +6823,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; CI: 
[[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -6229,18 +6842,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -6254,20 +6870,23 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP 
[[COPY]], [[C8]](s32) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 56) ; CI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 56) @@ -6280,19 +6899,22 @@ body: | ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; VI-LABEL: name: test_load_private_v4s32_align16 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -6320,9 +6942,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -6334,17 +6960,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + 
; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 56) @@ -6356,17 +6985,20 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 56) ; VI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 56) @@ -6378,16 +7010,19 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC 
[[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; GFX9-LABEL: name: test_load_private_v4s32_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -6415,9 +7050,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 56) @@ -6429,17 +7068,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: 
[[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 56) @@ -6451,17 +7093,20 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 56) ; GFX9: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 56) @@ -6473,16 +7118,19 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 
%1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 1, addrspace 56) @@ -6632,142 +7280,198 @@ body: | ; SI-LABEL: name: test_load_private_v4s32_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; SI: 
[[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; CI-LABEL: name: test_load_private_v4s32_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: 
[[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; VI-LABEL: name: test_load_private_v4s32_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[C4:%[0-9]+]]:_(s32) 
= G_CONSTANT i32 8 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; GFX9-LABEL: name: test_load_private_v4s32_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p5) = G_GEP 
[[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) - ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 2, addrspace 5) @@ -6811,9 +7515,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: 
[[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -6826,18 +7534,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -6851,20 +7562,23 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = 
G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; SI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -6877,19 +7591,22 @@ body: | ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; CI-LABEL: name: test_load_private_v4s32_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -6922,9 +7639,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 
1, addrspace 5) @@ -6937,18 +7658,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -6962,20 +7686,23 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; CI: 
[[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -6988,19 +7715,22 @@ body: | ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; VI-LABEL: name: test_load_private_v4s32_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -7028,9 +7758,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -7042,17 +7776,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; 
VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -7064,17 +7801,20 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; VI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -7086,16 +7826,19 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], 
[[C4]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; GFX9-LABEL: name: test_load_private_v4s32_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -7123,9 +7866,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -7137,17 +7884,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: 
[[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -7159,17 +7909,20 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -7181,16 +7934,19 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32), [[MV2]](s32), [[MV3]](s32) + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 1, addrspace 5) @@ -7615,9 +8371,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], 
[[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -7630,19 +8390,22 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -7656,19 +8419,22 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: 
[[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C6]](s32) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C7]](s32) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; SI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -7681,20 +8447,23 @@ body: | ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; SI: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV3]](s32), [[MV4]](s32) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV5]](s64) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; SI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; CI-LABEL: name: test_load_private_v2s64_align16 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -7727,9 +8496,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: 
[[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -7742,19 +8515,22 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -7768,19 +8544,22 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C6]](s32) + ; CI: 
[[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C7]](s32) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; CI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -7793,20 +8572,23 @@ body: | ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; CI: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV3]](s32), [[MV4]](s32) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV5]](s64) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; CI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; VI-LABEL: name: test_load_private_v2s64_align16 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -7834,9 +8616,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: 
(load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -7848,18 +8634,21 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -7871,16 +8660,19 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C5]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C6]](s32) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; VI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -7892,17 +8684,20 @@ body: | 
; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; VI: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV3]](s32), [[MV4]](s32) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV5]](s64) + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; GFX9-LABEL: name: test_load_private_v2s64_align16 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -7930,9 +8725,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -7944,18 +8743,21 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: 
[[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32) - ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -7967,16 +8769,19 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV3:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C5]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C6]](s32) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -7988,17 +8793,20 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), 
[[OR7]](s16) - ; GFX9: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MV3]](s32), [[MV4]](s32) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV5]](s64) + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s64>) = G_LOAD %0 :: (load 16, align 1, addrspace 5) @@ -8715,9 +9523,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -8730,19 +9542,22 @@ body: | ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; SI: 
$vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-LABEL: name: test_extload_private_v2s32_from_4_align1 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -8774,9 +9589,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -8789,19 +9608,22 @@ body: | ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY4]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY6]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_extload_private_v2s32_from_4_align1 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -8829,9 +9651,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -8843,16 +9669,19 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_extload_private_v2s32_from_4_align1 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 @@ -8880,9 +9709,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -8894,16 +9727,19 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] 
; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 1, addrspace 5) @@ -8919,78 +9755,110 @@ body: | ; SI-LABEL: name: test_extload_private_v2s32_from_4_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; CI-LABEL: name: test_extload_private_v2s32_from_4_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = 
G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; CI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; VI-LABEL: name: test_extload_private_v2s32_from_4_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; GFX9-LABEL: name: test_extload_private_v2s32_from_4_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[MV]](s32), [[MV1]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load 4, align 2, addrspace 5) @@ -9203,9 +10071,13 @@ body: | ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; SI: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -9218,18 +10090,21 @@ body: | ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -9243,21 +10118,24 @@ body: | ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; SI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; SI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; SI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], 
[[TRUNC11]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; SI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; SI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; SI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; SI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -9270,19 +10148,22 @@ body: | ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; SI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; SI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; SI: [[GEP15:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C6]](s32) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; SI: [[GEP15:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C7]](s32) ; SI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p5) :: (load 1, addrspace 5) ; SI: [[GEP16:%[0-9]+]]:_(p5) = G_GEP [[GEP15]], [[C]](s32) ; SI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p5) :: (load 1, addrspace 5) @@ -9295,18 +10176,21 @@ body: | ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; SI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C5]] - ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) - ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; SI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) + ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; SI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; SI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; SI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C3]] ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; SI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C5]] - ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) - ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; SI: 
[[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] - ; SI: [[MV5:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16) + ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) + ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; SI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C6]](s32) + ; SI: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL14]] ; SI: [[GEP19:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C4]](s32) ; SI: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[GEP19]](p5) :: (load 1, addrspace 5) ; SI: [[GEP20:%[0-9]+]]:_(p5) = G_GEP [[GEP19]], [[C]](s32) @@ -9320,21 +10204,24 @@ body: | ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; SI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C5]] - ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) - ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) + ; SI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; SI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; SI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; SI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C3]] ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; SI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C5]] - ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) - ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; SI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; SI: [[MV6:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR10]](s16), [[OR11]](s16) - ; SI: [[MV7:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV4]](s32), [[MV5]](s32), [[MV6]](s32) - ; SI: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV3]](s96) - ; SI: [[COPY26:%[0-9]+]]:_(s96) = COPY [[MV7]](s96) + ; SI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) + ; SI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL16]](s32) + ; SI: [[OR16:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR16]](s16) + ; SI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C6]](s32) + ; SI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; SI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; SI: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; SI: [[COPY26:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY25]](s96) ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY26]](s96) ; CI-LABEL: name: test_extload_private_v2s96_from_24_align1 @@ -9368,9 +10255,13 @@ body: | ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[COPY3]](s32) ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: 
[[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -9383,18 +10274,21 @@ body: | ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] - ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) - ; CI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[COPY5]](s32) + ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[TRUNC5]] ; CI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] - ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) - ; CI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[COPY7]](s32) + ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C6]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) @@ -9408,21 +10302,24 @@ body: | ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] - ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) - ; CI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] + ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[COPY9]](s32) + ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[TRUNC9]] ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C5]] - ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) - ; CI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; CI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY11]](s32) + ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) + ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] + ; CI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) 
+ ; CI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C6]](s32) + ; CI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; CI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; CI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; CI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; CI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -9435,19 +10332,22 @@ body: | ; CI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32) ; CI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] - ; CI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) - ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) - ; CI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] + ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[COPY13]](s32) + ; CI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[TRUNC13]] ; CI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; CI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; CI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32) ; CI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C5]] - ; CI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) - ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL7]](s32) - ; CI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] - ; CI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; CI: [[GEP15:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C6]](s32) + ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[COPY15]](s32) + ; CI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) + ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[TRUNC15]] + ; CI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C6]](s32) + ; CI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; CI: [[GEP15:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C7]](s32) ; CI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p5) :: (load 1, addrspace 5) ; CI: [[GEP16:%[0-9]+]]:_(p5) = G_GEP [[GEP15]], [[C]](s32) ; CI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p5) :: (load 1, addrspace 5) @@ -9460,18 +10360,21 @@ body: | ; CI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LOAD17]](s32) ; CI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C5]] - ; CI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) - ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) - ; CI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] + ; CI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[COPY17]](s32) + ; CI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[SHL12]](s32) + ; CI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[TRUNC17]] ; CI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; CI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C3]] ; CI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LOAD19]](s32) ; CI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C5]] - ; CI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) - ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) - ; CI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] - ; CI: 
[[MV5:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16) + ; CI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[COPY19]](s32) + ; CI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; CI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[TRUNC19]] + ; CI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; CI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; CI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C6]](s32) + ; CI: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL14]] ; CI: [[GEP19:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C4]](s32) ; CI: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[GEP19]](p5) :: (load 1, addrspace 5) ; CI: [[GEP20:%[0-9]+]]:_(p5) = G_GEP [[GEP19]], [[C]](s32) @@ -9485,21 +10388,24 @@ body: | ; CI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LOAD21]](s32) ; CI: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C5]] - ; CI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) - ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL10]](s32) - ; CI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] + ; CI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[COPY21]](s32) + ; CI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[SHL15]](s32) + ; CI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[TRUNC21]] ; CI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; CI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C3]] ; CI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; CI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LOAD23]](s32) ; CI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C5]] - ; CI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) - ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL11]](s32) - ; CI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] - ; CI: [[MV6:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR10]](s16), [[OR11]](s16) - ; CI: [[MV7:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV4]](s32), [[MV5]](s32), [[MV6]](s32) - ; CI: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV3]](s96) - ; CI: [[COPY26:%[0-9]+]]:_(s96) = COPY [[MV7]](s96) + ; CI: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[COPY23]](s32) + ; CI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[SHL16]](s32) + ; CI: [[OR16:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[TRUNC23]] + ; CI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; CI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR16]](s16) + ; CI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C6]](s32) + ; CI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; CI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; CI: [[COPY25:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; CI: [[COPY26:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY25]](s96) ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY26]](s96) ; VI-LABEL: name: test_extload_private_v2s96_from_24_align1 @@ -9528,9 +10434,13 @@ body: | ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; VI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP 
[[COPY]], [[C6]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -9542,17 +10452,20 @@ body: | ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -9564,18 +10477,21 @@ body: | ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; VI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD 
[[GEP11]](p5) :: (load 1, addrspace 5) ; VI: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -9587,16 +10503,19 @@ body: | ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; VI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; VI: [[GEP15:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C5]](s32) + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; VI: [[GEP15:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C6]](s32) ; VI: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p5) :: (load 1, addrspace 5) ; VI: [[GEP16:%[0-9]+]]:_(p5) = G_GEP [[GEP15]], [[C]](s32) ; VI: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p5) :: (load 1, addrspace 5) @@ -9608,16 +10527,19 @@ body: | ; VI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C3]] ; VI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; VI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C3]] - ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C4]](s16) - ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; VI: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C4]](s16) + ; VI: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; VI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; VI: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C3]] ; VI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; VI: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C3]] - ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C4]](s16) - ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] - ; VI: [[MV5:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16) - ; VI: [[GEP19:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C6]](s32) + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C4]](s16) + ; VI: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] + ; VI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; VI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; VI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C5]](s32) + ; VI: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL14]] + ; VI: [[GEP19:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C7]](s32) ; VI: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[GEP19]](p5) :: (load 1, addrspace 5) ; VI: [[GEP20:%[0-9]+]]:_(p5) = G_GEP [[GEP19]], [[C]](s32) ; VI: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[GEP20]](p5) :: (load 1, addrspace 5) @@ -9629,18 +10551,21 @@ body: | ; VI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C3]] ; VI: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; VI: [[AND21:%[0-9]+]]:_(s16) = G_AND 
[[TRUNC21]], [[C3]] - ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C4]](s16) - ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; VI: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C4]](s16) + ; VI: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL15]] ; VI: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; VI: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C3]] ; VI: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; VI: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C3]] - ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C4]](s16) - ; VI: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; VI: [[MV6:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR10]](s16), [[OR11]](s16) - ; VI: [[MV7:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV4]](s32), [[MV5]](s32), [[MV6]](s32) - ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV3]](s96) - ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV7]](s96) + ; VI: [[SHL16:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C4]](s16) + ; VI: [[OR16:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL16]] + ; VI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; VI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR16]](s16) + ; VI: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C5]](s32) + ; VI: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) ; GFX9-LABEL: name: test_extload_private_v2s96_from_24_align1 @@ -9669,9 +10594,13 @@ body: | ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]] - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) + ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C5]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 1, addrspace 5) @@ -9683,17 +10612,20 @@ body: | ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C3]] ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C3]] - ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) - ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C4]](s16) + ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL3]] ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C3]] ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C3]] - ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) - ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]] - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR3]](s16) - ; GFX9: 
[[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C6]](s32) + ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C4]](s16) + ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL4]] + ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 1, addrspace 5) @@ -9705,18 +10637,21 @@ body: | ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C3]] ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C3]] - ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) - ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C4]](s16) + ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]] ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C3]] ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C3]] - ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) - ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]] - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR4]](s16), [[OR5]](s16) - ; GFX9: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C7]](s32) + ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C4]](s16) + ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]] + ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16) + ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32) + ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP11:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C8]](s32) ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[GEP11]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP12:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C]](s32) ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[GEP12]](p5) :: (load 1, addrspace 5) @@ -9728,16 +10663,19 @@ body: | ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C3]] ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32) ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C3]] - ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) - ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]] + ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C4]](s16) + ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL9]] ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32) ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C3]] ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32) ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C3]] - ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) - ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]] - ; GFX9: 
[[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR6]](s16), [[OR7]](s16) - ; GFX9: [[GEP15:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C5]](s32) + ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C4]](s16) + ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL10]] + ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32) + ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]] + ; GFX9: [[GEP15:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C6]](s32) ; GFX9: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[GEP15]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP16:%[0-9]+]]:_(p5) = G_GEP [[GEP15]], [[C]](s32) ; GFX9: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[GEP16]](p5) :: (load 1, addrspace 5) @@ -9749,16 +10687,19 @@ body: | ; GFX9: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC16]], [[C3]] ; GFX9: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD17]](s32) ; GFX9: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC17]], [[C3]] - ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C4]](s16) - ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL8]] + ; GFX9: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND17]], [[C4]](s16) + ; GFX9: [[OR12:%[0-9]+]]:_(s16) = G_OR [[AND16]], [[SHL12]] ; GFX9: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD18]](s32) ; GFX9: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC18]], [[C3]] ; GFX9: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD19]](s32) ; GFX9: [[AND19:%[0-9]+]]:_(s16) = G_AND [[TRUNC19]], [[C3]] - ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C4]](s16) - ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL9]] - ; GFX9: [[MV5:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR8]](s16), [[OR9]](s16) - ; GFX9: [[GEP19:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C6]](s32) + ; GFX9: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[AND19]], [[C4]](s16) + ; GFX9: [[OR13:%[0-9]+]]:_(s16) = G_OR [[AND18]], [[SHL13]] + ; GFX9: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR12]](s16) + ; GFX9: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR13]](s16) + ; GFX9: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C5]](s32) + ; GFX9: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL14]] + ; GFX9: [[GEP19:%[0-9]+]]:_(p5) = G_GEP [[GEP11]], [[C7]](s32) ; GFX9: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[GEP19]](p5) :: (load 1, addrspace 5) ; GFX9: [[GEP20:%[0-9]+]]:_(p5) = G_GEP [[GEP19]], [[C]](s32) ; GFX9: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[GEP20]](p5) :: (load 1, addrspace 5) @@ -9770,18 +10711,21 @@ body: | ; GFX9: [[AND20:%[0-9]+]]:_(s16) = G_AND [[TRUNC20]], [[C3]] ; GFX9: [[TRUNC21:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD21]](s32) ; GFX9: [[AND21:%[0-9]+]]:_(s16) = G_AND [[TRUNC21]], [[C3]] - ; GFX9: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C4]](s16) - ; GFX9: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL10]] + ; GFX9: [[SHL15:%[0-9]+]]:_(s16) = G_SHL [[AND21]], [[C4]](s16) + ; GFX9: [[OR15:%[0-9]+]]:_(s16) = G_OR [[AND20]], [[SHL15]] ; GFX9: [[TRUNC22:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD22]](s32) ; GFX9: [[AND22:%[0-9]+]]:_(s16) = G_AND [[TRUNC22]], [[C3]] ; GFX9: [[TRUNC23:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD23]](s32) ; GFX9: [[AND23:%[0-9]+]]:_(s16) = G_AND [[TRUNC23]], [[C3]] - ; GFX9: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C4]](s16) - ; GFX9: [[OR11:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL11]] - ; GFX9: [[MV6:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR10]](s16), [[OR11]](s16) - ; GFX9: [[MV7:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV4]](s32), [[MV5]](s32), [[MV6]](s32) - ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV3]](s96) - ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY 
[[MV7]](s96) + ; GFX9: [[SHL16:%[0-9]+]]:_(s16) = G_SHL [[AND23]], [[C4]](s16) + ; GFX9: [[OR16:%[0-9]+]]:_(s16) = G_OR [[AND22]], [[SHL16]] + ; GFX9: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR15]](s16) + ; GFX9: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR16]](s16) + ; GFX9: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C5]](s32) + ; GFX9: [[OR17:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL17]] + ; GFX9: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) %0:_(p5) = COPY $vgpr0 @@ -9801,215 +10745,295 @@ body: | ; SI-LABEL: name: test_extload_private_v2s96_from_24_align2 ; SI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; SI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; SI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; SI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; SI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; SI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; SI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; SI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; SI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; SI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; SI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; SI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; SI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; SI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; SI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; SI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; SI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) 
+ ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; SI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; SI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; SI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; SI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; SI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; SI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C1]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; SI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C3]](s32) ; SI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; SI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; SI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; SI: [[MV5:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC8]](s16), [[TRUNC9]](s16) - ; SI: [[GEP9:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C2]](s32) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; SI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; SI: [[GEP9:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C4]](s32) ; SI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; SI: [[GEP10:%[0-9]+]]:_(p5) = G_GEP [[GEP9]], [[C]](s32) ; SI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p5) :: (load 2, addrspace 5) - ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; SI: [[MV6:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC10]](s16), [[TRUNC11]](s16) - ; SI: [[MV7:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV4]](s32), [[MV5]](s32), [[MV6]](s32) - ; SI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV3]](s96) - ; SI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV7]](s96) - ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; SI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; SI: [[COPY14:%[0-9]+]]:_(s96) = 
COPY [[MV1]](s96) + ; SI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; SI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; CI-LABEL: name: test_extload_private_v2s96_from_24_align2 ; CI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; CI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; CI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; CI: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; CI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; CI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; CI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; CI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; CI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; CI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; CI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; CI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; CI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; CI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; CI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; CI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; CI: 
[[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; CI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; CI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; CI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C1]](s32) + ; CI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; CI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; CI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; CI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; CI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; CI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C3]](s32) ; CI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; CI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; CI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; CI: [[MV5:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC8]](s16), [[TRUNC9]](s16) - ; CI: [[GEP9:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C2]](s32) + ; CI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; CI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; CI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; CI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; CI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; CI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; CI: [[GEP9:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C4]](s32) ; CI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; CI: [[GEP10:%[0-9]+]]:_(p5) = G_GEP [[GEP9]], [[C]](s32) ; CI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p5) :: (load 2, addrspace 5) - ; CI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; CI: [[MV6:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC10]](s16), [[TRUNC11]](s16) - ; CI: [[MV7:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV4]](s32), [[MV5]](s32), [[MV6]](s32) - ; CI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV3]](s96) - ; CI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV7]](s96) - ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; CI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; CI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; CI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; CI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; CI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; CI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; CI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; CI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; CI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; VI-LABEL: name: test_extload_private_v2s96_from_24_align2 ; VI: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; VI: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; VI: [[MV:%[0-9]+]]:_(s32) = 
G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; VI: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; VI: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; VI: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; VI: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; VI: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; VI: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; VI: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; VI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; VI: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; VI: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; VI: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; VI: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C1]](s32) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; VI: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; VI: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C3]](s32) ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; VI: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; VI: [[MV5:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC8]](s16), [[TRUNC9]](s16) - ; VI: [[GEP9:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C2]](s32) + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; VI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; VI: [[GEP9:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C4]](s32) ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; VI: [[GEP10:%[0-9]+]]:_(p5) = G_GEP [[GEP9]], [[C]](s32) ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p5) :: (load 2, addrspace 5) - ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; VI: [[MV6:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC10]](s16), [[TRUNC11]](s16) - ; VI: [[MV7:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV4]](s32), [[MV5]](s32), [[MV6]](s32) - ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV3]](s96) - ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV7]](s96) - ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; VI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; VI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; VI: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; VI: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; VI: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) ; GFX9-LABEL: name: test_extload_private_v2s96_from_24_align2 ; GFX9: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9: [[GEP:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C]](s32) ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C1]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR 
[[AND]], [[SHL]] + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GFX9: [[GEP1:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9: [[GEP2:%[0-9]+]]:_(p5) = G_GEP [[GEP1]], [[C]](s32) ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9: [[MV1:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C2]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9: [[GEP3:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C4]](s32) ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[GEP3]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32) ; GFX9: [[GEP4:%[0-9]+]]:_(p5) = G_GEP [[GEP3]], [[C]](s32) ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[GEP4]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32) - ; GFX9: [[MV2:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC4]](s16), [[TRUNC5]](s16) - ; GFX9: [[MV3:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV]](s32), [[MV1]](s32), [[MV2]](s32) - ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C3]](s32) + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C2]](s32) + ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]] + ; GFX9: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9: [[GEP5:%[0-9]+]]:_(p5) = G_GEP [[COPY]], [[C5]](s32) ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[GEP5]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32) ; GFX9: [[GEP6:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C]](s32) ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[GEP6]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32) - ; GFX9: [[MV4:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC6]](s16), [[TRUNC7]](s16) - ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C1]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32) + ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C1]] + ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]] + ; GFX9: [[GEP7:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C3]](s32) ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[GEP7]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32) ; GFX9: [[GEP8:%[0-9]+]]:_(p5) = G_GEP [[GEP7]], [[C]](s32) ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[GEP8]](p5) :: (load 2, addrspace 5) - ; GFX9: 
[[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32) - ; GFX9: [[MV5:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC8]](s16), [[TRUNC9]](s16) - ; GFX9: [[GEP9:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C2]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32) + ; GFX9: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C1]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32) + ; GFX9: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C1]] + ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C2]](s32) + ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL4]] + ; GFX9: [[GEP9:%[0-9]+]]:_(p5) = G_GEP [[GEP5]], [[C4]](s32) ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[GEP9]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32) ; GFX9: [[GEP10:%[0-9]+]]:_(p5) = G_GEP [[GEP9]], [[C]](s32) ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[GEP10]](p5) :: (load 2, addrspace 5) - ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32) - ; GFX9: [[MV6:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[TRUNC10]](s16), [[TRUNC11]](s16) - ; GFX9: [[MV7:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[MV4]](s32), [[MV5]](s32), [[MV6]](s32) - ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[MV3]](s96) - ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[MV7]](s96) - ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) - ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32) + ; GFX9: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C1]] + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LOAD11]](s32) + ; GFX9: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C1]] + ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL5]] + ; GFX9: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s96) = COPY [[MV]](s96) + ; GFX9: [[COPY14:%[0-9]+]]:_(s96) = COPY [[MV1]](s96) + ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY13]](s96) + ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY14]](s96) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 2, addrspace 5) %2:_(s96) = G_EXTRACT %1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir index 00538251f091b5..2a981be56c41db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values.mir @@ -55,7 +55,16 @@ body: | ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C1]](s32) ; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CHECK: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C3]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CHECK: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](p1) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -131,8 +140,12 @@ body: | ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], 
[[C4]](s32) ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CHECK: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[MV]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C6]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) ; CHECK: $vgpr0 = COPY [[COPY3]](s32) %0:_(s8) = G_CONSTANT i8 0 %1:_(s8) = G_CONSTANT i8 1 @@ -169,8 +182,12 @@ body: | ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32) ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] - ; CHECK: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16) - ; CHECK: $vgpr0 = COPY [[MV]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C7]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CHECK: $vgpr0 = COPY [[OR2]](s32) %0:_(s8) = G_CONSTANT i8 0 %1:_(s8) = G_CONSTANT i8 1 %2:_(s8) = G_CONSTANT i8 2 @@ -205,11 +222,21 @@ body: | ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) - ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) ; CHECK: $vgpr1_vgpr2 = COPY [[MV]](s64) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -278,8 +305,12 @@ body: | ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C9]](s32) ; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CHECK: [[OR5:%[0-9]+]]:_(s16) = G_OR [[OR4]], [[TRUNC7]] - ; CHECK: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR5]](s16) - ; CHECK: [[TRUNC8:%[0-9]+]]:_(s24) = G_TRUNC [[MV]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; 
CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CHECK: [[TRUNC8:%[0-9]+]]:_(s24) = G_TRUNC [[OR6]](s32) ; CHECK: S_NOP 0, implicit [[TRUNC8]](s24) %0:_(s4) = G_CONSTANT i4 0 %1:_(s4) = G_CONSTANT i4 1 @@ -346,8 +377,12 @@ body: | ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C10]](s32) ; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CHECK: [[OR5:%[0-9]+]]:_(s16) = G_OR [[OR4]], [[TRUNC7]] - ; CHECK: [[MV:%[0-9]+]]:_(s32) = G_MERGE_VALUES [[OR2]](s16), [[OR5]](s16) - ; CHECK: [[TRUNC8:%[0-9]+]]:_(s28) = G_TRUNC [[MV]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C11]](s32) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CHECK: [[TRUNC8:%[0-9]+]]:_(s28) = G_TRUNC [[OR6]](s32) ; CHECK: S_NOP 0, implicit [[TRUNC8]](s28) %0:_(s4) = G_CONSTANT i4 0 %1:_(s4) = G_CONSTANT i4 1 @@ -442,7 +477,20 @@ body: | ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[COPY10]](s32) ; CHECK: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) ; CHECK: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[TRUNC11]] - ; CHECK: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16), [[OR4]](s16), [[OR5]](s16) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C14]](s32) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL6]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C14]](s32) + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL7]] + ; CHECK: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; CHECK: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16) + ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C14]](s32) + ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]] + ; CHECK: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR6]](s32), [[OR7]](s32), [[OR8]](s32) ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(s8) = G_CONSTANT i8 0 %1:_(s8) = G_CONSTANT i8 1 @@ -466,13 +514,20 @@ name: test_merge_s96_s16_s16_s16_s16_s16_s16 body: | bb.0: ; CHECK-LABEL: name: test_merge_s96_s16_s16_s16_s16_s16_s16 - ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 - ; CHECK: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; CHECK: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 3 - ; CHECK: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 - ; CHECK: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 - ; CHECK: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C2]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[C3]], [[SHL1]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) 
= G_SHL [[C6]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C5]], [[SHL2]] + ; CHECK: [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32) ; CHECK: $vgpr0_vgpr1_vgpr2 = COPY [[MV]](s96) %0:_(s16) = G_CONSTANT i16 0 %1:_(s16) = G_CONSTANT i16 1 @@ -531,7 +586,16 @@ body: | ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32) ; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) ; CHECK: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[TRUNC7]] - ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s16), [[OR1]](s16), [[OR2]](s16), [[OR3]](s16) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C10]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C10]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32) ; CHECK: [[TRUNC8:%[0-9]+]]:_(s56) = G_TRUNC [[MV]](s64) ; CHECK: S_NOP 0, implicit [[TRUNC8]](s56) %0:_(s8) = G_CONSTANT i8 0 @@ -706,12 +770,80 @@ name: test_merge_p3_s16_s16 body: | bb.0: ; CHECK-LABEL: name: test_merge_p3_s16_s16 - ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 - ; CHECK: [[MV:%[0-9]+]]:_(p3) = G_MERGE_VALUES [[C]](s16), [[C1]](s16) - ; CHECK: $vgpr0 = COPY [[MV]](p3) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; CHECK: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(s16) = G_CONSTANT i16 0 %1:_(s16) = G_CONSTANT i16 1 %2:_(p3) = G_MERGE_VALUES %0, %1 $vgpr0 = COPY %2 ... + +--- +name: test_merge_s32_s16_s16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: test_merge_s32_s16_s16 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: $vgpr0 = COPY [[OR]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s32) = G_MERGE_VALUES %2, %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: test_merge_s48_s16_s16_s16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: test_merge_s48_s16_s16_s16 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C]] + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]] + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s64) = COPY [[MV]](s64) + ; CHECK: $vgpr0_vgpr1 = COPY [[COPY6]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + + %3:_(s16) = G_TRUNC %0 + %4:_(s16) = G_TRUNC %1 + %5:_(s16) = G_TRUNC %2 + + %6:_(s48) = G_MERGE_VALUES %3, %4, %5 + %7:_(s64) = G_ANYEXT %6 + $vgpr0_vgpr1 = COPY %7 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir index be49cb6817bb64..2bd357c5b846f3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir @@ -216,9 +216,13 @@ body: | ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<2 x s16>), 0 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<2 x s16>), 16 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[EXTRACT]](s16), [[EXTRACT1]](s16) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; CHECK: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -239,9 +243,13 @@ body: | ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; CHECK: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<2 x s16>), 16 - ; CHECK: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<2 x s16>), 0 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[EXTRACT]](s16), [[EXTRACT1]](s16) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST 
[[COPY]](<2 x s16>) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) ; CHECK: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index 25f5d873e9d5c5..e5428803965c51 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ define amdgpu_ps void 
@raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: 
BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,8 +173,8 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) - ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; UNPACKED: bb.3: @@ -211,8 +211,8 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 
0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) - ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; PACKED: bb.3: @@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -312,7 +312,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -328,7 +328,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -354,7 +354,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], 
%11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -370,7 +370,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -396,7 +396,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -412,7 +412,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -459,8 +459,8 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 
[[REG_SEQUENCE3]], [[COPY12]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) - ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; UNPACKED: bb.3: @@ -500,8 +500,8 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY10]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) - ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; PACKED: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 7de9e455442591..ae881bb0a9dc28 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], 
[[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: 
[[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,8 +132,8 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE 
[[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -202,7 +202,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -226,7 +226,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -250,7 +250,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x 
i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -294,8 +294,8 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 3ed040bdfffcbd..64fd2929a0d727 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float 
%val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,8 +71,8 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -102,9 +102,9 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -146,10 +146,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], 
[[COPY6]], implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], 
[[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: 
BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; 
CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: 
[[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -474,8 +474,8 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -537,7 +537,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -559,7 +559,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -581,7 +581,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -600,7 +600,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], 
%subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -618,7 +618,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -639,7 +639,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -661,7 +661,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -683,7 +683,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] 
; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -722,8 +722,8 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -765,8 +765,8 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir index 721563a41cbea8..213d12ea123c53 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-buffer-load.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s --- name: buffer_load_sss @@ -10,13 +12,20 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 - ; CHECK-LABEL: name: buffer_load_sss - ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE64-LABEL: name: buffer_load_sss + ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE64: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE64: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE32-LABEL: name: buffer_load_sss + ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE32: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE32: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $sgpr4 %2:_(s32) = COPY $sgpr5 @@ -32,13 +41,20 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr5 - ; CHECK-LABEL: name: buffer_load_ssv - ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr5 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS 
intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE64-LABEL: name: buffer_load_ssv + ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr5 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE64: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE32-LABEL: name: buffer_load_ssv + ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr5 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE32: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $sgpr4 %2:_(s32) = COPY $vgpr5 @@ -54,13 +70,20 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 - ; CHECK-LABEL: name: buffer_load_svs - ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE64-LABEL: name: buffer_load_svs + ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE64: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE64: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE32-LABEL: name: buffer_load_svs + ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE32: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE32: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $sgpr4 %2:_(s32) = COPY $sgpr5 @@ -76,39 +99,72 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $sgpr5 - ; CHECK-LABEL: name: buffer_load_vss - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $sgpr5 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: 
[[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %10, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: + ; WAVE64-LABEL: name: buffer_load_vss + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $sgpr5 + ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE64: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE64: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; WAVE64: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; WAVE64: .1: + ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1 + ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; WAVE64: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), 
[[V_READFIRSTLANE_B32_1]](s32) + ; WAVE64: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; WAVE64: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE64: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; WAVE64: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE64: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE64: .2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; WAVE64: .3: + ; WAVE32-LABEL: name: buffer_load_vss + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $sgpr5 + ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE32: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE32: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; WAVE32: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; WAVE32: .1: + ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1 + ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; WAVE32: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; WAVE32: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; WAVE32: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE32: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 
[[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; WAVE32: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE32: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE32: .2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; WAVE32: .3: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr4 %2:_(s32) = COPY $sgpr5 @@ -124,38 +180,70 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr5 - ; CHECK-LABEL: name: buffer_load_vvs - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr5 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: + ; WAVE64-LABEL: name: buffer_load_vvs + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr5 + ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; WAVE64: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; WAVE64: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; WAVE64: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; WAVE64: .1: + ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; WAVE64: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; WAVE64: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; WAVE64: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE64: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; WAVE64: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE64: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE64: .2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; WAVE64: .3: + ; WAVE32-LABEL: name: buffer_load_vvs + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr5 + ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; WAVE32: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5 + ; WAVE32: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; WAVE32: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; WAVE32: 
[[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; WAVE32: .1: + ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; WAVE32: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; WAVE32: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; WAVE32: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE32: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; WAVE32: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE32: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE32: .2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; WAVE32: .3: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 %2:_(s32) = COPY $sgpr5 @@ -171,12 +259,18 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr4, $vgpr5 - ; CHECK-LABEL: name: buffer_load_svv - ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr4, $vgpr5 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE64-LABEL: name: buffer_load_svv + ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr4, $vgpr5 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; WAVE64: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE32-LABEL: name: buffer_load_svv + ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr4, $vgpr5 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; 
WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; WAVE32: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[COPY]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr4 %2:_(s32) = COPY $vgpr5 @@ -192,39 +286,72 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr5 - ; CHECK-LABEL: name: buffer_load_vsv - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr5 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %10, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: + ; WAVE64-LABEL: name: buffer_load_vsv + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr5 + ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = 
COPY [[COPY1]](s32) + ; WAVE64: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; WAVE64: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; WAVE64: .1: + ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1 + ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; WAVE64: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; WAVE64: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; WAVE64: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE64: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; WAVE64: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE64: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE64: .2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; WAVE64: .3: + ; WAVE32-LABEL: name: buffer_load_vsv + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr5 + ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; WAVE32: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; WAVE32: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; WAVE32: .1: + ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1 + ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; WAVE32: 
[[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; WAVE32: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; WAVE32: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; WAVE32: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE32: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; WAVE32: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE32: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE32: .2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; WAVE32: .3: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr4 %2:_(s32) = COPY $vgpr5 @@ -240,38 +367,70 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 - ; CHECK-LABEL: name: buffer_load_vvv - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec - ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec - ; CHECK: 
[[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: + ; WAVE64-LABEL: name: buffer_load_vvv + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 + ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; WAVE64: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; WAVE64: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; WAVE64: .1: + ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; WAVE64: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; WAVE64: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; WAVE64: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; WAVE64: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE64: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; WAVE64: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE64: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE64: .2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; WAVE64: .3: + ; WAVE32-LABEL: name: buffer_load_vvv + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 + ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 + ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; WAVE32: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; WAVE32: [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; WAVE32: .1: + ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec + ; WAVE32: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) + ; WAVE32: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[MV]](s64), [[UV]](s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub0(s64), implicit $exec + ; WAVE32: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV1]].sub1(s64), implicit $exec + ; WAVE32: [[MV1:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE32: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; WAVE32: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) + ; WAVE32: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), [[COPY2]](s32), 0, 0 + ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE32: .2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; WAVE32: .3: %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 %2:_(s32) = COPY $vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir index 8c68b1ef24180e..26f86855a43eb3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir @@ -39,12 +39,12 @@ body: | ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 ; CHECK: [[PHI1:%[0-9]+]]:sgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[COPY]](<4 x s32>), [[V_READFIRSTLANE_B32_]](s32), 0 - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; CHECK: .2: @@ -76,7 +76,7 @@ body: | ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 ; CHECK: [[PHI1:%[0-9]+]]:sgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -89,7 +89,7 @@ body: | ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32), 0 - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; CHECK: .2: @@ -121,7 +121,7 @@ body: | ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 ; CHECK: [[PHI1:%[0-9]+]]:sgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %2(<4 x s32>), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 
[[UV]].sub0(s64), implicit $exec
     ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec
@@ -134,10 +134,10 @@ body: |
     ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
     ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32)
     ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY1]](s32), implicit $exec
+    ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY1]](s32), implicit $exec
     ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc
     ; CHECK: [[INT:%[0-9]+]]:sgpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[V_READFIRSTLANE_B32_4]](s32), 0
-    ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
+    ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
     ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
     ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
     ; CHECK: .2:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.init.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.init.mir
new file mode 100644
index 00000000000000..ff851cf91476c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.init.mir
@@ -0,0 +1,79 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: ds_gws_init_s_s
+legalized: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: ds_gws_init_s_s
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), [[COPY2]](s32), [[COPY1]](s32)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), %0, %1
+...
+ +--- +name: ds_gws_init_s_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $vgpr0 + ; CHECK-LABEL: name: ds_gws_init_s_v + ; CHECK: liveins: $sgpr0, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), [[COPY2]](s32), [[V_READFIRSTLANE_B32_]] + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = COPY $vgpr0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), %0, %1 +... + +--- +name: ds_gws_init_v_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $sgpr0 + ; CHECK-LABEL: name: ds_gws_init_v_s + ; CHECK: liveins: $vgpr0, $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), [[COPY]](s32), [[COPY1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $sgpr0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), %0, %1 +... + +--- +name: ds_gws_init_v_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: ds_gws_init_v_v + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), [[COPY]](s32), [[V_READFIRSTLANE_B32_]] + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.init), %0, %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.sema.v.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.sema.v.mir new file mode 100644 index 00000000000000..5695b13b440359 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ds.gws.sema.v.mir @@ -0,0 +1,37 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: ds_gws_init_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + ; CHECK-LABEL: name: ds_gws_init_s + ; CHECK: liveins: $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.v), [[COPY]](s32) + %0:_(s32) = COPY $sgpr0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.v), %0 +... + +--- +name: ds_gws_init_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: ds_gws_init_v + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.v), [[V_READFIRSTLANE_B32_]] + %0:_(s32) = COPY $vgpr0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.v), %0 +... 
+ diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 5618c1f910a5d7..44a17012237ecb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -76,7 +76,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF2]], %bb.1, %19, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %19, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %12(<4 x s32>), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -99,7 +99,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -138,7 +138,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF2]], %bb.1, %20, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %20, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %12(<4 x s32>), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -161,7 +161,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), 
[[V_READFIRSTLANE_B32_7]](s32) ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index 0ea100dc20299e..c59372a8d09c74 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +; XUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s ; Natural mapping define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { @@ -91,7 +91,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -114,7 +114,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -157,7 +157,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -170,7 +170,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -214,7 +214,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF2]], %bb.1, %24, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF2]], %bb.1, %24, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF1]](<4 x s32>), %bb.1, %17(<4 x s32>), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -248,7 +248,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; CHECK: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index 71014f1ab8114d..33a8e9a1284cc5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s +; XUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s ; Natural mapping define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { @@ -60,7 +60,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.1, %15, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %15, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -73,7 +73,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -104,12 +104,12 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.1, %15, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %15, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2 ; CHECK: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -141,7 +141,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.1, %15, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %15, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -154,10 +154,10 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir index eaff3354b98499..b021fb7992b79f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsg.mir @@ -11,11 +11,9 @@ body: | liveins: $sgpr0 ; CHECK-LABEL: name: sendmsg_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), [[C]](s32), [[COPY]](s32) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 0, [[COPY]](s32) %0:_(s32) = COPY $sgpr0 - %1:_(s32) = G_CONSTANT i32 0 ; FIXME: Should not be a constant - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), %1, %0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 0, %0 ... --- @@ -27,11 +25,8 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: sendmsg_v ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), [[C]](s32), [[V_READFIRSTLANE_B32_]] + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 0, [[V_READFIRSTLANE_B32_]] %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_CONSTANT i32 0 ; FIXME: Should not be a constant - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), %1, %0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 0, %0 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir index 1ece5a9259be7a..77214b9bb04f08 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.sendmsghalt.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s +# XUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s --- name: sendmsghalt_s @@ -11,11 +11,9 @@ body: | liveins: $sgpr0 ; CHECK-LABEL: name: sendmsghalt_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), [[C]](s32), [[COPY]](s32) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), 0, [[COPY]](s32) %0:_(s32) = COPY $sgpr0 - %1:_(s32) = G_CONSTANT i32 0 ; FIXME: Should not be a constant - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), %1, %0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), 0, %0 ... 
--- @@ -27,11 +25,8 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: sendmsghalt_v ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), [[C]](s32), [[V_READFIRSTLANE_B32_]] + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), 0, [[V_READFIRSTLANE_B32_]] %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_CONSTANT i32 0 ; FIXME: Should not be a constant - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), %1, %0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsghalt), 0, %0 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index 5991e3c8917123..9bc81aecc8a1dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -64,7 +64,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.1, %16, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %16, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -77,7 +77,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -109,12 +109,12 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.1, %16, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %16, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 ; CHECK: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -147,7 +147,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.1, %16, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.1, %16, %bb.2 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec @@ -160,10 +160,10 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index 6b8dd4cfd9941b..efe81eabc34971 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -65,7 +65,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -77,7 +77,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -108,11 +108,11 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, 
[[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: @@ -144,7 +144,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF]], %bb.1, %14, %bb.2 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub0(s64), implicit $exec ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[UV]].sub1(s64), implicit $exec ; CHECK: [[MV:%[0-9]+]]:sreg_64_xexec(s64) = G_MERGE_VALUES [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32) @@ -156,10 +156,10 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir new file mode 100644 index 00000000000000..a766df2a3d005e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir @@ -0,0 +1,32 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s + +--- +name: ffbh_u32_s +legalized: true + +body: | + bb.0: + liveins: $sgpr0 + + ; CHECK-LABEL: name: ffbh_u32_s + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[COPY]](s32) + %0:_(s32) = COPY $sgpr0 + %1:_(s32) = G_AMDGPU_FFBH_U32 %0 +... 
+ +--- +name: ffbh_u32_v +legalized: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: ffbh_u32_v + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[COPY]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_AMDGPU_FFBH_U32 %0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir index 6601a181d2494c..45d809b786f11c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-constant.mir @@ -26,12 +26,8 @@ body: | bb.0: ; CHECK-LABEL: name: test_constant_s32_sgpr_use ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), [[C1]](s32), [[C]](s32) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 0, [[C]](s32) %0:_(s32) = G_CONSTANT i32 1 - - ; FIXME: Should not be a constant - %1:_(s32) = G_CONSTANT i32 0 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), %1, %0 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.s.sendmsg), 0, %0 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir index accfaf08624bbd..2733146187feb4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir @@ -1,6 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck -check-prefix=WAVE64 %s +# XUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck -check-prefix=WAVE64 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck -check-prefix=WAVE32 %s +# XUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck -check-prefix=WAVE32 %s + --- name: extract_vector_elt_v16s32_ss @@ -10,12 +13,19 @@ tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 - ; CHECK-LABEL: name: extract_vector_elt_v16s32_ss - ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 - ; CHECK: [[EVEC:%[0-9]+]]:sgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32) - ; CHECK: $vgpr0 = COPY [[EVEC]](s32) + + ; WAVE64-LABEL: name: extract_vector_elt_v16s32_ss + ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = 
COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; WAVE64: [[EVEC:%[0-9]+]]:sgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32) + ; WAVE64: $vgpr0 = COPY [[EVEC]](s32) + ; WAVE32-LABEL: name: extract_vector_elt_v16s32_ss + ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; WAVE32: [[EVEC:%[0-9]+]]:sgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32) + ; WAVE32: $vgpr0 = COPY [[EVEC]](s32) %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 %1:_(s32) = COPY $sgpr16 %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -30,30 +40,53 @@ tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 - ; CHECK-LABEL: name: extract_vector_elt_v16s32_sv - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<16 x s32>) = COPY [[COPY]](<16 x s32>) - ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY2]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: - ; CHECK: $vgpr0 = COPY [[EVEC]](s32) + + ; WAVE64-LABEL: name: extract_vector_elt_v16s32_sv + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = 
S_MOV_B64_term $exec + ; WAVE64: .1: + ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 + ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) + ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE64: .2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; WAVE64: .3: + ; WAVE64: $vgpr0 = COPY [[EVEC]](s32) + ; WAVE32-LABEL: name: extract_vector_elt_v16s32_sv + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; WAVE32: .1: + ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 + ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) + ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE32: .2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; WAVE32: .3: + ; WAVE32: $vgpr0 = COPY [[EVEC]](s32) %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -68,12 +101,19 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 - ; CHECK-LABEL: name: extract_vector_elt_v16s32_vs - ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK: 
[[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32) - ; CHECK: $vgpr0 = COPY [[EVEC]](s32) + + ; WAVE64-LABEL: name: extract_vector_elt_v16s32_vs + ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32) + ; WAVE64: $vgpr0 = COPY [[EVEC]](s32) + ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vs + ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[COPY1]](s32) + ; WAVE32: $vgpr0 = COPY [[EVEC]](s32) %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $sgpr0 %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -88,35 +128,210 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 - ; CHECK-LABEL: name: extract_vector_elt_v16s32_vv - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: - ; CHECK: $vgpr0 = COPY [[EVEC]](s32) + + ; WAVE64-LABEL: name: extract_vector_elt_v16s32_vv + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 + ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) 
= COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 + ; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; WAVE64: .1: + ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 + ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) + ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE64: .2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; WAVE64: .3: + ; WAVE64: $vgpr0 = COPY [[EVEC]](s32) + ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 + ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 + ; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; WAVE32: .1: + ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1 + ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 + ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) + ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE32: .2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; WAVE32: .3: + ; WAVE32: $vgpr0 = COPY [[EVEC]](s32) %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %1 $vgpr0 = COPY %2 ... 
+--- +name: extract_vector_elt_v8s64_ss +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + + ; WAVE64-LABEL: name: extract_vector_elt_v8s64_ss + ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; WAVE64: [[EVEC:%[0-9]+]]:sgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[COPY1]](s32) + ; WAVE64: $sgpr0_sgpr1 = COPY [[EVEC]](s64) + ; WAVE32-LABEL: name: extract_vector_elt_v8s64_ss + ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; WAVE32: [[EVEC:%[0-9]+]]:sgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[COPY1]](s32) + ; WAVE32: $sgpr0_sgpr1 = COPY [[EVEC]](s64) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s32) = COPY $sgpr16 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $sgpr0_sgpr1 = COPY %2 +... + +--- +name: extract_vector_elt_v8s64_vs +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + + ; WAVE64-LABEL: name: extract_vector_elt_v8s64_vs + ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY1]], [[C]](s32) + ; WAVE64: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; WAVE64: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; WAVE32-LABEL: name: extract_vector_elt_v8s64_vs + ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY1]], [[C]](s32) + ; WAVE32: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; WAVE32: [[EVEC1:%[0-9]+]]:vgpr(s32) = 
G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s32) = COPY $sgpr0 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: extract_vector_elt_v8s64_sv +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + + ; WAVE64-LABEL: name: extract_vector_elt_v8s64_sv + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; WAVE64: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; WAVE64: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; WAVE64: .1: + ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %20, %bb.1 + ; WAVE64: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1 + ; WAVE64: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1 + ; WAVE64: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1 + ; WAVE64: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1 + ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; WAVE64: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; WAVE64: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE64: .2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; WAVE64: .3: + ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; WAVE32-LABEL: name: extract_vector_elt_v8s64_sv + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; WAVE32: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; WAVE32: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; WAVE32: .1: + ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF4]], %bb.0, %20, %bb.1 + ; WAVE32: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1 + ; WAVE32: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1 + ; WAVE32: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1 + ; WAVE32: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1 + ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; WAVE32: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; WAVE32: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE32: .2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; WAVE32: .3: + ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s32) = COPY $vgpr0 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $vgpr0_vgpr1 = COPY %2 +... 
+ --- name: extract_vector_elt_v8s64_vv legalized: true @@ -125,29 +340,77 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 - ; CHECK-LABEL: name: extract_vector_elt_v8s64_vv - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 - ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(s64) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s64) = G_PHI [[DEF]](s64), %bb.0, %2(s64), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[V_READFIRSTLANE_B32_]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: - ; CHECK: $vgpr0_vgpr1 = COPY [[EVEC]](s64) + + ; WAVE64-LABEL: name: extract_vector_elt_v8s64_vv + ; WAVE64: successors: %bb.1(0x80000000) + ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 + ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 + ; WAVE64: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; WAVE64: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE64: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; WAVE64: .1: + ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %20, %bb.1 + ; WAVE64: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1 + ; WAVE64: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1 + ; WAVE64: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1 + ; WAVE64: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1 + ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), 
[[COPY1]](s32), implicit $exec + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; WAVE64: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; WAVE64: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE64: .2: + ; WAVE64: successors: %bb.3(0x80000000) + ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; WAVE64: .3: + ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; WAVE32-LABEL: name: extract_vector_elt_v8s64_vv + ; WAVE32: successors: %bb.1(0x80000000) + ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 + ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 + ; WAVE32: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; WAVE32: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; WAVE32: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo + ; WAVE32: .1: + ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF4]], %bb.0, %20, %bb.1 + ; WAVE32: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1 + ; WAVE32: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1 + ; WAVE32: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1 + ; WAVE32: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1 + ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; WAVE32: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; WAVE32: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; WAVE32: .2: + ; WAVE32: successors: %bb.3(0x80000000) + ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]] + ; WAVE32: .3: + ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(<8 x s64>) = 
COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir index a3e7d7423b3826..47ebf3488d2091 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir @@ -56,8 +56,7 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 @@ -69,17 +68,35 @@ body: | --- name: insert_vector_elt_v4i32_s_s_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0 ; CHECK-LABEL: name: insert_vector_elt_v4i32_s_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $sgpr4 @@ -91,17 +108,35 @@ body: | --- name: insert_vector_elt_v4i32_s_v_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1 ; CHECK-LABEL: name: insert_vector_elt_v4i32_s_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: 
$sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 @@ -113,16 +148,35 @@ body: | --- name: insert_vector_elt_var_v4i32_v_s_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr0 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def 
$scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr4 @@ -134,12 +188,14 @@ body: | --- name: insert_vector_elt_var_v4i32_v_v_s legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr0 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_v_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 @@ -155,16 +211,35 @@ body: | --- name: insert_vector_elt_var_v4i32_v_v_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 @@ -172,3 +247,299 @@ body: | %3:_(<4 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 ... 
+ +--- +name: insert_vector_elt_v8s64_s_s_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_s_s + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[IVEC:%[0-9]+]]:sgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[COPY2]](s32) + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %3 +... + +--- +name: insert_vector_elt_v8s64_v_s_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr16_sgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_s_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr16_sgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[COPY2]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_s_v_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $sgpr16 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_v_s + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $sgpr16 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY2]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $vgpr0_vgpr1 + %2:_(s32) = COPY $sgpr16 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_s_s_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $vgpr0 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $vgpr0 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_s_v_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF5]], %bb.0, %25, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 + ; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $vgpr0_vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: insert_vector_elt_v8s64_v_v_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_v_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr16_vgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY2]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $vgpr16_vgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_v_s_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0_sgpr1, $vgpr16 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0_sgpr1, $vgpr16 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $sgpr0_sgpr1 + %2:_(s32) = COPY $vgpr16 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_v_v_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr16_vgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr18 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF5]], %bb.0, %25, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 + ; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $vgpr16_vgpr17 + %2:_(s32) = COPY $vgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 2c8864ca5be911..9bf75371e339e8 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s @@ -203,8 +205,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9: buffer_store_dwordx4 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; VI: flat_load_dword v[[A:[0-9]+]] -; VI: flat_load_dword v[[B:[0-9]+]] +; VI-DAG: flat_load_dword v[[A:[0-9]+]] +; VI-DAG: flat_load_dword v[[B:[0-9]+]] ; VI-DAG: v_add_u16_e32 ; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll index bd430be7172f88..6f8b67e8e3239f 100644 --- a/llvm/test/CodeGen/AMDGPU/add3.ll +++ b/llvm/test/CodeGen/AMDGPU/add3.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s @@ -245,12 +247,12 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float ; ; GFX10-LABEL: add3_uniform_vgpr: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0 ; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0 -; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0 -; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4 +; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: ; return to shader part epilog %a1 = fadd float %a, 1.0 %b2 = fadd float %b, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 84a1172d1c42b6..913906deaebebd 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s @@ -148,7 +150,6 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX1064: s_or_saveexec_b64 s[4:5], -1 ; GFX1064: v_mov_b32_e32 v3, v1 ; GFX1064: v_mov_b32_e32 v4, v1 -; GFX1064: s_mov_b32 s2, -1 ; GFX1064: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064: v_add_nc_u32_e32 v2, v2, v3 ; GFX1064: v_mov_b32_e32 v3, v1 @@ -165,17 +166,18 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064: v_add_nc_u32_e32 v2, v2, v4 ; GFX1064: v_mov_b32_e32 v4, v1 -; GFX1064: v_readlane_b32 s3, v2, 31 -; GFX1064: v_mov_b32_e32 v3, s3 +; GFX1064: v_readlane_b32 s2, v2, 31 +; GFX1064: v_mov_b32_e32 v3, s2 ; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064: v_add_nc_u32_e32 v2, v2, v4 ; GFX1064: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064: v_readlane_b32 s3, v2, 15 -; GFX1064: v_readlane_b32 s6, v2, 31 -; GFX1064: v_writelane_b32 v1, s3, 16 -; GFX1064: v_readlane_b32 s3, v2, 63 -; GFX1064: v_writelane_b32 v1, s6, 32 +; GFX1064: v_readlane_b32 s2, v2, 15 +; GFX1064: v_readlane_b32 s3, v2, 31 ; GFX1064: v_readlane_b32 s6, v2, 47 +; GFX1064: v_writelane_b32 v1, s2, 16 +; GFX1064: s_mov_b32 s2, -1 +; GFX1064: v_writelane_b32 v1, s3, 32 +; GFX1064: v_readlane_b32 s3, v2, 63 ; GFX1064: v_writelane_b32 v1, s6, 48 ; GFX1064: s_mov_b64 exec, s[4:5] ; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll index c05f29622c7463..175be85005366d 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll @@ -1,4 +1,7 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. +; REQUIRES: disabled +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -enable-misched=false -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s ; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 40bb9be033f8ee..e346b98c272259 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,SI ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,FLAT,TONGA @@ -391,13 +393,13 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 -; SI-NEXT: s_mov_b32 s6, 0xcccccccc -; SI-NEXT: s_mov_b32 s8, 0x55555555 -; SI-NEXT: s_mov_b32 s9, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s6, 0xff00 +; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s10, 0x33333333 +; SI-NEXT: s_mov_b32 s11, 0xcccccccc +; SI-NEXT: s_mov_b32 s0, 0x55555555 +; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 8 ; SI-NEXT: v_alignbit_b32 v4, v1, v0, 24 @@ -410,36 +412,36 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp ; SI-NEXT: v_and_b32_e32 v0, 0xff0000, v0 ; SI-NEXT: v_and_b32_e32 v4, 0xff0000, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff000000, v5 -; SI-NEXT: v_and_b32_e32 v7, s0, v7 +; SI-NEXT: v_and_b32_e32 v7, s6, v7 +; SI-NEXT: v_and_b32_e32 v2, s6, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v2, s0, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v5, v7, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_or_b32_e32 v1, v4, v5 ; SI-NEXT: v_or_b32_e32 v3, v0, v2 -; SI-NEXT: v_and_b32_e32 v0, s1, v1 -; SI-NEXT: v_and_b32_e32 v2, s2, v1 -; SI-NEXT: v_and_b32_e32 v1, s1, v3 -; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s8, v1 +; SI-NEXT: v_and_b32_e32 v2, s9, v1 +; SI-NEXT: v_and_b32_e32 v1, s8, v3 +; SI-NEXT: v_and_b32_e32 v3, s9, v3 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 -; SI-NEXT: v_and_b32_e32 v3, s6, v3 -; SI-NEXT: v_and_b32_e32 v2, s6, v2 +; SI-NEXT: v_and_b32_e32 v1, s10, v3 +; SI-NEXT: v_and_b32_e32 v0, s10, v2 +; SI-NEXT: v_and_b32_e32 v3, s11, v3 +; SI-NEXT: v_and_b32_e32 v2, s11, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s8, v3 -; SI-NEXT: v_and_b32_e32 v0, s8, v2 -; SI-NEXT: v_and_b32_e32 v3, s9, v3 -; SI-NEXT: v_and_b32_e32 v2, s9, v2 +; SI-NEXT: v_and_b32_e32 v1, s0, v3 +; SI-NEXT: v_and_b32_e32 v0, s0, v2 +; SI-NEXT: v_and_b32_e32 v3, s1, v3 +; SI-NEXT: v_and_b32_e32 v2, s1, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 @@ -452,60 +454,60 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: v_mov_b32_e32 v4, 8 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f0 +; FLAT-NEXT: v_mov_b32_e32 v2, 
8 +; FLAT-NEXT: s_mov_b32 s2, 0x33333333 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x33333333 -; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s6, 0x55555555 -; FLAT-NEXT: s_mov_b32 s8, 0xaaaaaaaa +; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_lshlrev_b64 v[2:3], 24, v[0:1] -; FLAT-NEXT: v_alignbit_b32 v2, v1, v0, 24 -; FLAT-NEXT: v_alignbit_b32 v6, v1, v0, 8 -; FLAT-NEXT: v_lshlrev_b32_sdwa v7, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 8, v[0:1] -; FLAT-NEXT: v_lshlrev_b32_e32 v4, 24, v0 +; FLAT-NEXT: v_lshlrev_b64 v[4:5], 24, v[0:1] +; FLAT-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; FLAT-NEXT: v_lshlrev_b64 v[2:3], 8, v[0:1] +; FLAT-NEXT: v_alignbit_b32 v6, v1, v0, 24 +; FLAT-NEXT: v_alignbit_b32 v7, v1, v0, 8 +; FLAT-NEXT: v_lshlrev_b32_e32 v2, 24, v0 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; FLAT-NEXT: v_and_b32_e32 v2, 0xff0000, v2 -; FLAT-NEXT: v_and_b32_e32 v6, 0xff000000, v6 ; FLAT-NEXT: v_and_b32_e32 v0, 0xff0000, v0 -; FLAT-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; FLAT-NEXT: v_or_b32_e32 v2, v6, v2 -; FLAT-NEXT: v_and_b32_e32 v3, 0xff00, v3 -; FLAT-NEXT: v_or_b32_e32 v1, v2, v1 -; FLAT-NEXT: v_or_b32_e32 v0, v4, v0 -; FLAT-NEXT: v_or_b32_sdwa v2, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FLAT-NEXT: v_and_b32_e32 v4, 0xff0000, v6 +; FLAT-NEXT: v_and_b32_e32 v6, 0xff000000, v7 +; FLAT-NEXT: v_and_b32_e32 v5, 0xff00, v5 +; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 +; FLAT-NEXT: v_or_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FLAT-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; FLAT-NEXT: v_or_b32_e32 v4, v6, v4 +; FLAT-NEXT: v_or_b32_e32 v1, v4, v1 ; FLAT-NEXT: v_or_b32_e32 v3, v0, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v1 -; FLAT-NEXT: v_and_b32_e32 v2, s3, v1 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v1 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v1 +; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] +; FLAT-NEXT: s_mov_b32 s0, 0x55555555 ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] +; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s6, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s6, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s8, v2 +; FLAT-NEXT: v_and_b32_e32 
v1, s0, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -778,23 +780,23 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0xff00 +; SI-NEXT: s_mov_b32 s5, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s6, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s7, 0x33333333 ; SI-NEXT: s_mov_b32 s8, 0xcccccccc ; SI-NEXT: s_mov_b32 s9, 0x55555555 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[4:5], v[2:3], 8 ; SI-NEXT: v_alignbit_b32 v6, v3, v2, 24 @@ -819,7 +821,7 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_mov_b32_e32 v7, 0xff00 ; SI-NEXT: v_and_b32_e32 v2, v0, v11 ; SI-NEXT: v_and_b32_e32 v11, v0, v12 -; SI-NEXT: v_and_b32_e32 v9, s0, v9 +; SI-NEXT: v_and_b32_e32 v9, s4, v9 ; SI-NEXT: v_and_b32_e32 v12, 0xff000000, v13 ; SI-NEXT: v_and_b32_e32 v0, v0, v17 ; SI-NEXT: v_and_b32_e32 v13, v7, v15 @@ -828,7 +830,7 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v2, v10, v2 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v4, s0, v4 +; SI-NEXT: v_and_b32_e32 v4, s4, v4 ; SI-NEXT: v_or_b32_e32 v7, v16, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v9, v12, v11 @@ -838,14 +840,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v1 ; SI-NEXT: v_or_b32_e32 v3, v2, v0 -; SI-NEXT: v_and_b32_e32 v0, s1, v6 -; SI-NEXT: v_and_b32_e32 v2, s2, v6 -; SI-NEXT: v_and_b32_e32 v4, s1, v5 -; SI-NEXT: v_and_b32_e32 v6, s2, v5 -; SI-NEXT: v_and_b32_e32 v5, s1, v7 -; SI-NEXT: v_and_b32_e32 v7, s2, v7 -; SI-NEXT: v_and_b32_e32 v1, s1, v3 -; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s5, v6 +; SI-NEXT: v_and_b32_e32 v2, s6, v6 +; SI-NEXT: v_and_b32_e32 v4, s5, v5 +; SI-NEXT: v_and_b32_e32 v6, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, s5, v7 +; SI-NEXT: v_and_b32_e32 v7, s6, v7 +; SI-NEXT: v_and_b32_e32 v1, s5, v3 +; SI-NEXT: v_and_b32_e32 v3, s6, v3 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 @@ -854,12 +856,12 @@ 
define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_or_b32_e32 v6, v6, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v5, s3, v7 -; SI-NEXT: v_and_b32_e32 v4, s3, v6 +; SI-NEXT: v_and_b32_e32 v5, s7, v7 +; SI-NEXT: v_and_b32_e32 v4, s7, v6 ; SI-NEXT: v_and_b32_e32 v7, s8, v7 ; SI-NEXT: v_and_b32_e32 v6, s8, v6 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 +; SI-NEXT: v_and_b32_e32 v1, s7, v3 +; SI-NEXT: v_and_b32_e32 v0, s7, v2 ; SI-NEXT: v_and_b32_e32 v3, s8, v3 ; SI-NEXT: v_and_b32_e32 v2, s8, v2 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2 @@ -886,7 +888,8 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v1, v5, v8 ; SI-NEXT: v_or_b32_e32 v0, v4, v7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i64: @@ -895,63 +898,63 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: v_mov_b32_e32 v8, 8 -; FLAT-NEXT: v_mov_b32_e32 v10, 0xff0000 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s2, 0x33333333 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s1, 0x33333333 -; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 ; FLAT-NEXT: s_mov_b32 s8, 0x55555555 ; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 24, v[2:3] +; FLAT-NEXT: v_lshlrev_b64 v[6:7], 24, v[2:3] ; FLAT-NEXT: v_lshlrev_b32_sdwa v12, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; FLAT-NEXT: v_lshlrev_b32_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; FLAT-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; FLAT-NEXT: v_lshlrev_b64 v[8:9], 8, v[0:1] -; FLAT-NEXT: v_lshlrev_b64 v[6:7], 8, v[2:3] -; FLAT-NEXT: v_alignbit_b32 v4, v3, v2, 24 +; FLAT-NEXT: v_alignbit_b32 v10, v3, v2, 24 ; FLAT-NEXT: v_alignbit_b32 v11, v3, v2, 8 +; FLAT-NEXT: v_lshlrev_b64 v[4:5], 8, v[2:3] +; FLAT-NEXT: v_alignbit_b32 v13, v1, v0, 8 ; FLAT-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; FLAT-NEXT: v_or_b32_sdwa v12, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; FLAT-NEXT: v_alignbit_b32 v13, v1, v0, 24 -; FLAT-NEXT: v_alignbit_b32 v14, v1, v0, 8 +; FLAT-NEXT: v_alignbit_b32 v6, v1, v0, 24 ; FLAT-NEXT: v_lshlrev_b32_e32 v8, 24, v0 ; FLAT-NEXT: v_lshlrev_b32_e32 v15, 8, v0 +; FLAT-NEXT: v_or_b32_sdwa v12, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 24, v[0:1] -; FLAT-NEXT: v_lshlrev_b32_e32 v6, 24, v2 +; FLAT-NEXT: v_lshlrev_b32_e32 v4, 24, v2 +; FLAT-NEXT: v_mov_b32_e32 v0, 0xff0000 ; FLAT-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; FLAT-NEXT: 
v_and_b32_e32 v0, 0xff0000, v4 -; FLAT-NEXT: v_and_b32_e32 v4, 0xff000000, v11 -; FLAT-NEXT: v_and_b32_e32 v2, v10, v2 -; FLAT-NEXT: v_and_b32_e32 v11, v10, v13 -; FLAT-NEXT: v_or_b32_e32 v0, v4, v0 +; FLAT-NEXT: v_and_b32_e32 v2, v0, v2 ; FLAT-NEXT: v_and_b32_e32 v1, 0xff00, v1 -; FLAT-NEXT: v_and_b32_e32 v13, 0xff000000, v14 -; FLAT-NEXT: v_and_b32_e32 v4, 0xff00, v5 -; FLAT-NEXT: v_and_b32_e32 v10, v10, v15 -; FLAT-NEXT: v_or_b32_e32 v5, v13, v11 -; FLAT-NEXT: v_or_b32_e32 v2, v6, v2 -; FLAT-NEXT: v_or_b32_e32 v3, v0, v3 -; FLAT-NEXT: v_or_b32_sdwa v0, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; FLAT-NEXT: v_or_b32_e32 v6, v8, v10 +; FLAT-NEXT: v_and_b32_e32 v6, v0, v6 +; FLAT-NEXT: v_and_b32_e32 v10, 0xff0000, v10 +; FLAT-NEXT: v_and_b32_e32 v11, 0xff000000, v11 +; FLAT-NEXT: v_and_b32_e32 v13, 0xff000000, v13 +; FLAT-NEXT: v_and_b32_e32 v0, v0, v15 +; FLAT-NEXT: v_and_b32_e32 v7, 0xff00, v7 +; FLAT-NEXT: v_or_b32_e32 v10, v11, v10 +; FLAT-NEXT: v_or_b32_e32 v2, v4, v2 +; FLAT-NEXT: v_or_b32_e32 v4, v13, v6 +; FLAT-NEXT: v_or_b32_e32 v6, v8, v0 +; FLAT-NEXT: v_or_b32_sdwa v0, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; FLAT-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; FLAT-NEXT: v_or_b32_e32 v7, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v5, v5, v12 +; FLAT-NEXT: v_or_b32_e32 v5, v4, v12 ; FLAT-NEXT: v_or_b32_e32 v8, v6, v1 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v7 -; FLAT-NEXT: v_and_b32_e32 v2, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v3, s0, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s2, v5 -; FLAT-NEXT: v_and_b32_e32 v6, s0, v5 -; FLAT-NEXT: v_and_b32_e32 v5, s2, v8 -; FLAT-NEXT: v_and_b32_e32 v7, s0, v8 +; FLAT-NEXT: v_or_b32_e32 v7, v2, v0 +; FLAT-NEXT: v_or_b32_e32 v3, v10, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v3 +; FLAT-NEXT: v_and_b32_e32 v1, s0, v7 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v3 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s0, v5 +; FLAT-NEXT: v_and_b32_e32 v6, s1, v5 +; FLAT-NEXT: v_and_b32_e32 v5, s0, v8 +; FLAT-NEXT: v_and_b32_e32 v7, s1, v8 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] @@ -960,10 +963,10 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s1, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v5, s2, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s2, v6 ; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 ; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 ; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir index 323396795dd460..4ac48b1133aad5 100644 --- a/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -393,12 +393,12 @@ name: trivial_clause_load_mubuf4_x2 body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_mubuf4_x2 - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 
0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -407,13 +407,13 @@ name: break_clause_simple_load_mubuf_offen_ptr body: | bb.0: ; GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -424,11 +424,11 @@ name: mubuf_load4_overwrite_ptr body: | bb.0: ; GCN-LABEL: name: mubuf_load4_overwrite_ptr - ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec S_ENDPGM 0 @@ -443,11 +443,11 @@ body: | ; GCN-LABEL: name: break_clause_flat_load_mubuf_load ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
# Break a clause from interference between mubuf and flat instructions @@ -462,7 +462,7 @@ name: break_clause_mubuf_load_flat_load body: | bb.0: - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 @@ -504,12 +504,12 @@ name: break_clause_atomic_rtn_into_ptr_mubuf4 body: | bb.0: ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4 - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; XNACK-NEXT: S_NOP 0 ; GCN-NEXT: $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -521,11 +521,11 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4 ; GCN: BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -536,11 +536,11 @@ name: no_break_clause_mubuf_load_novaddr body: | bb.0: ; GCN-LABEL: name: no_break_clause_mubuf_load_novaddr - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
--- diff --git a/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir index 16021126d1ea0d..f631bcd258115a 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir +++ b/llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir @@ -55,10 +55,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -117,10 +117,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -180,10 +180,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -245,10 +245,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -322,10 +322,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -387,10 +387,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir b/llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir index 7839d514a1443d..599cacb8261552 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir @@ -30,7 +30,7 @@ body: | %14:vgpr_32 = V_AND_B32_e32 1, %13, implicit $exec %15:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %14, implicit $exec %16:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %15, implicit $exec - BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4) S_ENDPGM 0 bb.2: @@ -78,7 +78,7 @@ body: | bb.8: successors: %bb.10 - %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) + %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) %34:sreg_64_xexec = V_CMP_NE_U32_e64 0, %31, implicit $exec %35:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %34, implicit $exec %28:vgpr_32 = COPY %35 diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir index d7d8b41f68330a..bc549f7bb87b4f 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir @@ -83,7 +83,7 @@ body: | bb.9: successors: %bb.10(0x80000000) - %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) %21:sreg_64 = V_CMP_NE_U32_e64 target-flags(amdgpu-gotprel) 0, killed %19.sub0, implicit $exec %22:sreg_64 = COPY $exec, implicit-def $exec %23:sreg_64 = S_AND_B64 %22, %21, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir index 4dffe32f9b180f..67399883ae07d5 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir @@ -68,7 +68,7 @@ body: | %23:vreg_128 = COPY killed %17 %24:sreg_64 = COPY killed %16 %25:vgpr_32 = 
V_OR_B32_e32 %22, %11, implicit $exec - %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) %28:vgpr_32 = V_LSHRREV_B32_e32 30, killed %26.sub0, implicit $exec %29:vreg_128 = COPY killed %21 %29.sub0:vreg_128 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir index eb3d6169e97621..773466af7adb29 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir @@ -11,7 +11,7 @@ # # GCN-LABEL: bb.6: # GCN: successors: %bb.7(0x{{[0-9]+}}), %bb.18(0x{{[0-9]+}}) -# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, 0, implicit $exec # --- | @@ -69,7 +69,7 @@ body: | %10:sreg_64 = COPY killed %5 undef %11.sub2:sreg_128 = COPY %4 %11.sub3:sreg_128 = COPY %3 - %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, implicit $exec + %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, 0, implicit $exec undef %13.sub1:vreg_128 = COPY %9.sub1 %13.sub2:vreg_128 = COPY %9.sub2 %14:sreg_64 = V_CMP_GT_F32_e64 0, target-flags(amdgpu-rel32-lo) 0, 0, killed %12.sub3, 0, implicit $exec @@ -161,7 +161,7 @@ body: | bb.18: successors: %bb.7(0x80000000) dead %59:vgpr_32 = V_FMA_F32 0, killed %9.sub2, 0, undef %60:vgpr_32, 0, undef %61:vgpr_32, 0, 0, implicit $exec - dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sreg_128, undef %65:sreg_32, 0, 0, 0, 0, 0, implicit $exec + dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sreg_128, undef %65:sreg_32, 0, 0, 0, 0, 0, 0, implicit $exec undef %66.sub1:vreg_128 = COPY %13.sub1 %66.sub2:vreg_128 = COPY %13.sub2 %67:sreg_64 = V_CMP_NGT_F32_e64 0, 0, 0, undef %68:vgpr_32, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir b/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir index a01f1e71dacef1..4c532e89398e15 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir @@ -148,7 +148,7 @@ body: | %43:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %44:sreg_128, 12, 0, 0 :: (dereferenceable invariant load 4) %45:vgpr_32 = V_MUL_LO_I32 killed %42, killed %43, implicit $exec %46:vgpr_32 = V_LSHLREV_B32_e32 2, killed %45, implicit $exec - %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) + %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) %49:sreg_64 = V_CMP_NE_U32_e64 0, killed %47, implicit $exec %50:sreg_64 = COPY $exec, implicit-def $exec %51:sreg_64 = S_AND_B64 %50, %49, implicit-def dead $scc diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir index 708814e3df4584..1a26a507cd9bfd 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir 
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -33,7 +33,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] @@ -44,7 +44,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: DBG_VALUE @@ -80,7 +80,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -92,7 +92,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -141,7 +141,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] @@ -152,7 +152,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: bb.4: @@ -188,7 +188,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 
0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -200,7 +200,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -249,7 +249,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] @@ -260,7 +260,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: bb.4: @@ -297,7 +297,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -309,7 +309,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -358,7 +358,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -370,7 +370,7 @@ body: | ; GCN: 
%5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF @@ -408,7 +408,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -420,7 +420,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: %15:sgpr_32 = IMPLICIT_DEF @@ -471,7 +471,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] @@ -482,7 +482,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF @@ -520,7 +520,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -532,7 +532,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ 
-583,7 +583,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -595,7 +595,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -631,7 +631,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -643,7 +643,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -691,7 +691,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -703,7 +703,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -739,7 +739,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = 
S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -751,7 +751,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -799,7 +799,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -811,7 +811,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.5(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -850,7 +850,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -862,7 +862,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir index 44a8e38a5655b8..9219083bb64cec 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir @@ -42,7 +42,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) 
+ ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -54,7 +54,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -91,7 +91,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -103,7 +103,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index e5ff97a7be3de4..92e29f3a529099 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -54,7 +54,7 @@ body: | %8 = S_MOV_B32 9999 %9 = S_AND_B32 killed %7, killed %8, implicit-def dead $scc %10 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -219,7 +219,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_LSHL_B32 killed %5, 12, implicit-def dead $scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -419,7 +419,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_ASHR_I32 killed %5, 12, implicit-def dead $scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -627,7 +627,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_LSHR_B32 killed %5, 12, implicit-def dead $scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index f00151942693bf..b3edc846ef8b7a 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s @@ -8,37 +10,37 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -72,16 +74,16 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 ; 
VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 @@ -102,25 +104,25 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, 0 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s8, s4 ; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -160,61 +162,60 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 -; SI-NEXT: s_mov_b32 s19, s15 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s16, s2 -; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: s_mov_b32 s6, s14 -; SI-NEXT: s_mov_b32 s7, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s16, s6 +; SI-NEXT: s_mov_b32 s17, s7 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: s_mov_b32 s0, s10 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_add_u32_e32 v0, vcc, s12, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s1, s11 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -230,77 +231,76 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0 ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_movk_i32 s1, 0xff -; SI-NEXT: s_movk_i32 s2, 0x900 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s12, 0xff00 +; SI-NEXT: s_movk_i32 s13, 0xff +; SI-NEXT: s_movk_i32 s14, 0x900 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: v_and_b32_e32 v2, s0, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 9, v0 -; SI-NEXT: v_and_b32_e32 v0, s1, v0 -; SI-NEXT: v_and_b32_e32 v3, s0, v1 +; SI-NEXT: v_and_b32_e32 v4, s12, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s1, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, s12, v0 +; SI-NEXT: v_and_b32_e32 v3, s13, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, s13, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 
s14, v2 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_movk_i32 s10, 0xff00 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_movk_i32 s8, 0xff +; VI-NEXT: s_movk_i32 s9, 0x900 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_movk_i32 s8, 0xff ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_movk_i32 s9, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s10, v1 +; VI-NEXT: v_and_b32_e32 v4, s10, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 +; VI-NEXT: v_add_u16_e32 v3, 9, v0 ; VI-NEXT: v_and_b32_e32 v1, s8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_and_b32_e32 v2, s10, v0 -; VI-NEXT: v_add_u16_e32 v0, 9, v0 -; VI-NEXT: v_and_b32_e32 v0, s8, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_and_b32_e32 v3, s8, v3 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_add_u16_e32 v1, s9, v1 -; VI-NEXT: v_add_u16_e32 v0, s9, v0 +; VI-NEXT: v_add_u16_e32 v2, s9, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -317,42 +317,42 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, 0 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b32 s16, 0xff00 -; SI-NEXT: s_movk_i32 s17, 0xff -; SI-NEXT: s_movk_i32 s18, 0x900 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 ; SI-NEXT: s_mov_b32 s8, s4 ; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s4, 0xff00 +; SI-NEXT: s_movk_i32 s5, 0xff +; SI-NEXT: s_movk_i32 s6, 0x900 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 
s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s16, v1 +; SI-NEXT: v_and_b32_e32 v4, s4, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 -; SI-NEXT: v_and_b32_e32 v2, s16, v0 -; SI-NEXT: v_and_b32_e32 v3, s17, v3 +; SI-NEXT: v_and_b32_e32 v2, s4, v0 +; SI-NEXT: v_and_b32_e32 v3, s5, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s17, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s18, v2 +; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -360,39 +360,41 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s14, 0xff00 -; VI-NEXT: s_movk_i32 s12, 0xff -; VI-NEXT: s_movk_i32 s13, 0x900 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_movk_i32 s6, 0xff00 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_movk_i32 s4, 0xff +; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_movk_i32 s5, 0x900 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s14, v1 +; VI-NEXT: v_and_b32_e32 v4, s6, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s12, v1 +; VI-NEXT: v_and_b32_e32 v1, s4, v1 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 -; VI-NEXT: v_and_b32_e32 v2, s14, v0 -; VI-NEXT: v_and_b32_e32 v3, s12, v3 +; VI-NEXT: v_and_b32_e32 v2, s6, v0 +; VI-NEXT: v_and_b32_e32 v3, s4, v3 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s13, v1 -; VI-NEXT: v_add_u16_e32 v2, s13, v2 +; VI-NEXT: v_add_u16_e32 v1, s5, v1 +; VI-NEXT: v_add_u16_e32 v2, s5, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -407,41 +409,41 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o define amdgpu_kernel void @test_copy_v3i8_align4(<3 x 
i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v3i8_align4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x diff --git a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir index 4679831c786d1a..bba41584bc97fa 100644 --- a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir +++ b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir @@ -291,7 +291,7 @@ body: | bb.3..lr.ph3410.preheader: successors: %bb.4(0x80000000) - dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) dead %60:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec %36:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc dead %67:vgpr_32 = V_MOV_B32_e32 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index dae8402a1ae51f..b019c2e810bca5 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,SI ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=FUNC,GCN,VI @@ -65,21 +67,22 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32: @@ -131,16 +134,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v2, v1 ; SI-NEXT: v_ffbh_u32_e32 v3, v0 @@ -148,7 +151,8 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v2i32: @@ -206,16 +210,16 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: 
s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v4, v3 ; SI-NEXT: v_ffbh_u32_e32 v5, v2 @@ -229,7 +233,8 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 ; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v4i32: @@ -299,9 +304,9 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +506,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 @@ -509,7 +513,7 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v4, v2 ; SI-NEXT: v_ffbh_u32_e32 v5, v3 @@ -520,7 +524,8 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, 64, v3, vcc ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64: @@ -588,7 +593,6 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 @@ -596,8 +600,8 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v3 ; SI-NEXT: v_ffbh_u32_e32 v5, v4 @@ -607,7 +611,8 @@ 
define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 ; SI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; SI-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64_trunc: @@ -615,26 +620,26 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v3, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[1:2] +; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v4, v0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4 +; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 ; VI-NEXT: v_ffbh_u32_e32 v5, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i64_trunc: @@ -676,19 +681,20 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -742,19 +748,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -809,23 +816,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: @@ -885,23 +893,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
v_ctlz_i32_sel_ne_bitwidth: @@ -961,18 +970,19 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: @@ -1030,9 +1040,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,19 +1112,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 4f693e204b6323..5627cd3025581c 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=GCN,SI ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=GCN,VI @@ -266,20 +268,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1) define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_movk_i32 s12, 0xff -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_movk_i32 s13, 0x900 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 @@ -292,8 +294,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v6 ; SI-NEXT: v_and_b32_e32 v7, s12, v7 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v6, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 @@ -304,44 +305,44 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_mov_b32_e32 v4, 9 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_movk_i32 s8, 0x900 -; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v4, 9 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: 
s_movk_i32 s0, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v5 -; VI-NEXT: v_add_u16_e32 v9, 9, v5 +; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5 +; VI-NEXT: v_add_u16_e32 v8, 9, v5 ; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 +; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, s8, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u16_e32 v0, s0, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -365,32 +366,33 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: v_or_b32_e32 v4, v5, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v3, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 @@ -415,35 +417,32 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[4:5] -; VI-NEXT: flat_load_ubyte v11, v[2:3] -; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 6, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: flat_load_ubyte v1, v[8:9] +; VI-NEXT: flat_load_ubyte v1, v[12:13] +; VI-NEXT: flat_load_ubyte v10, v[10:11] +; VI-NEXT: flat_load_ubyte v8, v[8:9] ; VI-NEXT: flat_load_ubyte v7, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 -; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_sdwa v1, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v4, v4, v7 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; VI-NEXT: v_or_b32_e32 v4, v4, v10 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 @@ -906,42 +905,42 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 
s3, s7 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; SI-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index a39833455a153d..70e5df5788aeef 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sreg_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF2:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sreg_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) diff --git 
a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 2945d7ddef0dbc..5ed103f94374d6 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s @@ -344,8 +346,8 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa } ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16: -; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} +; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} ; VI-NOT: v_and_b32 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}} diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll index 117307968502dd..7d4df36729782a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -check-prefix=VI %s @@ -7,23 +9,23 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_uge_f64: @@ -59,23 +61,23 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_oge_f64: @@ -111,23 +113,23 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, 
-1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_ugt_f64: @@ -163,23 +165,23 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_ogt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll index 306b4c25fb9253..5a233c2c99d923 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -check-prefix=VI %s @@ -5,23 +7,23 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_uge_f64: @@ -57,23 +59,23 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ugt_f64: @@ -109,23 +111,23 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ule_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, 
-1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ule_f64: @@ -161,23 +163,23 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ult_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ult_f64: @@ -213,23 +215,23 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_oge_f64: @@ -265,23 +267,23 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 
s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ogt_f64: @@ -317,23 +319,23 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ole_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ole_f64: @@ -369,23 +371,23 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, d define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_olt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_olt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 0ab2e6710c3b87..248cbe6ab5cc75 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -331,7 +331,8 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out ; GFX10-FLUSH: 
v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] +; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -439,7 +440,8 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocap ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] +; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 5dda92dbd5ecd8..6ba76661440286 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -march=amdgcn -mcpu=hawaii -start-after=sink -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s @@ -115,7 +117,7 @@ define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] ; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -139,7 +141,7 @@ define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -157,9 +159,9 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl } ; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32: -; GCN-SAFE: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}} -; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}} +; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]] ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] @@ -312,7 +314,7 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -332,7 +334,7 @@ define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -352,7 +354,7 @@ define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e64 
[[ADD:v[0-9]+]], [[A]], -[[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1863,7 +1865,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addr ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1883,7 +1885,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1903,7 +1905,7 @@ define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index aa52c3d0b4e955..4747b829b7a8c6 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s @@ -40,7 +42,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, ; unless isFabsFree returns true ; GCN-LABEL: {{^}}fneg_fabs_free_f16: -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000 +; GCN: {{s_or_b32 s[0-9]+, s[0-9]+, 0x8000|s_bitset1_b32 s[0-9]+, 15}} define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) @@ -50,7 +52,7 @@ define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) } ; GCN-LABEL: {{^}}fneg_fabs_f16: -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000 +; GCN: {{s_or_b32 s[0-9]+, s[0-9]+, 0x8000|s_bitset1_b32 s[0-9]+, 15}} define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) %fsub = fsub half -0.0, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir index a015a1ef4d1134..f80176508befea 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir @@ -23,13 +23,13 @@ body: | ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 %1:sreg_32_xm0 = S_MOV_B32 0 %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 SI_RETURN_TO_EPILOG $vgpr0 @@ -57,12 +57,12 @@ body: | ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 SI_RETURN_TO_EPILOG $vgpr0 @@ -87,15 +87,15 @@ body: | ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 
7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 @@ -119,15 +119,15 @@ body: | ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir index 7fe6ce845ab9ea..f2d423a707851f 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir @@ -17,7 +17,7 @@ body: | %4:vgpr_32 = V_LSHLREV_B32_e64 killed %3, %0, implicit $exec %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %6:vreg_64 = REG_SEQUENCE killed %4, %subreg.sub0, killed %5, %subreg.sub1 - %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, 0, implicit $exec %8:sreg_32_xm0 = S_MOV_B32 65535 %9:vgpr_32 = COPY %8 %10:vgpr_32 = V_AND_B32_e32 %7, %9, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir index 
3ab99551012fd3..1e596b79016ab3 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir @@ -158,10 +158,10 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %12 = V_MOV_B32_e32 1065353216, implicit $exec %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... @@ -222,13 +222,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 1065353216, implicit $exec %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -289,14 +289,14 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 1065353216, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... @@ -360,16 +360,16 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 1065353216, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $exec %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float 
addrspace(1)* undef`) S_ENDPGM 0 ... @@ -427,13 +427,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 1, implicit $exec %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... @@ -494,16 +494,16 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 -2, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $exec %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -564,13 +564,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 15360, implicit $exec %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... @@ -631,13 +631,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %13 = V_MOV_B32_e32 80886784, implicit $exec %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -697,13 +697,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %13 = V_MOV_B32_e32 305413120, implicit $exec %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir index 65b254e7616ad0..e26f0c934fce46 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir @@ -60,13 +60,13 @@ body: | %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -131,13 +131,13 @@ body: | %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -202,13 +202,13 @@ body: | %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -273,13 +273,13 @@ body: | %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/fold-multiple.mir b/llvm/test/CodeGen/AMDGPU/fold-multiple.mir index ef35b263457953..d8c396c9d4a4fb 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-multiple.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-multiple.mir @@ -34,7 +34,7 @@ body: | %3 = S_LSHL_B32 %1, killed %1, implicit-def dead $scc %4 = V_AND_B32_e64 killed %2, killed %3, implicit $exec %5 = IMPLICIT_DEF - BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir b/llvm/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir new file mode 100644 index 00000000000000..11af6e19ecb24f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir @@ -0,0 +1,15 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -o - -run-pass si-fix-sgpr-copies -verify-machineinstrs %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: fold_acc_copy_into_valu +# GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY %0.sub0 +# GCN: %2:vgpr_32 = V_AND_B32_e32 [[COPY]], undef %3:vgpr_32, implicit $exec +--- +name: fold_acc_copy_into_valu +body: | + bb.0.entry: + + %0:areg_1024 = IMPLICIT_DEF + %1:sreg_32_xm0 = COPY %0.sub0 + %3:vgpr_32 = V_AND_B32_e32 %1, undef %2:vgpr_32, implicit $exec + +... diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll index b21fd985226790..0573e34e5231b8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -1,12 +1,14 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GFX9 %s ; Test for a conv2d like sequence of loads. 
-; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}} +; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}} +; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} +; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) { diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll index e4ad729ff715bb..f596578b511efd 100644 --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s ; uniform loads @@ -82,9 +84,9 @@ define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspac ; CHECK-LABEL: @global_array ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] ; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 -; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 -; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 -; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 +; CHECK-DAG: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 +; CHECK-DAG: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 +; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] @A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir b/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir index 8cc294f57b2680..bd6244127e6f9f 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir @@ -12,7 +12,7 @@ body: | bb.0.entry: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr7, $vgpr8, $vgpr9, $vgpr10 - BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, 0, implicit $exec $vgpr7 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $m0, implicit $exec S_ENDPGM 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir index 9ef2431df6ee78..d0f32f287473c4 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -57,7 +57,7 @@ body: | BUNDLE implicit-def $sgpr0_sgpr1, implicit $sgpr10_sgpr11 { $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0 } - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -91,5 +91,5 @@ body: | } bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 15f9fb2a6bfb19..9e4ed1e04a4c50 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s @@ -24,25 +26,25 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, s6, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, v3, s2 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v7, -1, v3 +; GFX9-NEXT: v_add_u32_e32 v4, s6, v4 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX9-NEXT: s_add_u32 s6, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: s_add_u32 s4, s4, 4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB0_1 @@ -88,29 +90,29 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s2 -; GFX9-NEXT: v_sub_u32_e32 
v5, 1, v3 -; GFX9-NEXT: v_not_b32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2 +; GFX9-NEXT: v_not_b32_e32 v6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, 1, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3 -; GFX9-NEXT: v_add_u32_e32 v4, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6 +; GFX9-NEXT: v_add_u32_e32 v5, s6, v5 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v4, s6, v6 ; GFX9-NEXT: v_add_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_add_u32_e32 v5, s6, v5 ; GFX9-NEXT: s_add_u32 s6, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: s_add_u32 s4, s4, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB1_1 @@ -162,15 +164,15 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v5, s6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s6, v4 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: s_add_i32 s6, s6, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: s_add_u32 s4, s4, 4 ; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 @@ -222,10 +224,10 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 ; GFX9-NEXT: v_sub_u32_e32 v4, s3, v3 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 @@ -275,19 +277,19 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX9-NEXT: BB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v2, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v2, v7, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1 ; 
GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v7 +; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 @@ -326,15 +328,15 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v7, v1 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v7, -v8, v0, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1 +; GFX9-NEXT: v_trunc_f32_e32 v7, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7 +; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 200851712740bf..11a1ca5cf69ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s @@ -90,29 +92,29 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -244,22 +246,22 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s2, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s3, s4 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s1, s0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> 
addrspace(1)* %src2, @@ -318,18 +320,18 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -341,18 +343,18 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -364,29 +366,29 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, 
s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -446,18 +448,18 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -469,18 +471,18 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -492,18 +494,18 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; 
GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -516,16 +518,16 @@ define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -634,29 +636,29 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -716,18 +718,18 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 0xffff +; GFX8-NEXT: s_and_b32 s6, s3, 0xffff ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -739,18 +741,18 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s1, s3, 0xffff +; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -762,18 +764,18 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s1, s3, 0xffff +; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -786,16 +788,16 @@ define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 
v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -851,14 +853,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s3, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mad_u32_u24 v0, s3, s3, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 0xffff -; GFX8-NEXT: s_lshr_b32 s1, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -870,14 +872,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, s3, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, s2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -889,14 +891,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, s3, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, s2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -906,16 +908,16 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: 
s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s1, s4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1024,29 +1026,29 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1155,29 +1157,29 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x4 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x4 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1306,22 +1308,22 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s7 +; GFX10-DL-NEXT: s_and_b32 s4, s4, s7 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s1, s5, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX10-DL-NEXT: s_and_b32 s3, s4, s8 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1450,22 +1452,22 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s1, s5, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, 
off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1594,22 +1596,22 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-DL-NEXT: s_and_b32 s7, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s5 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s2, s3, s2 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1742,23 +1744,23 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s7, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s5 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1821,19 +1823,19 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: 
s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1845,19 +1847,19 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1869,19 +1871,19 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1894,17 +1896,17 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: 
s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s3, s2, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s3, s2, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2039,23 +2041,23 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2119,19 +2121,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, 
s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2143,19 +2145,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2167,19 +2169,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2192,17 +2194,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i16 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i16 s1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i16 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i16 s6, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; 
GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2338,23 +2340,23 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s4 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2418,19 +2420,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2442,19 +2444,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: 
s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2466,19 +2468,19 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2491,17 +2493,17 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2693,25 +2695,25 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; 
GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2720,26 +2722,26 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NODL-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2748,26 +2750,26 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-DL-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; 
GFX9-DL-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2775,7 +2777,6 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 @@ -2784,17 +2785,18 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: global_load_ushort v2, v[2:3], off -; GFX10-DL-NEXT: global_load_ushort v7, v[0:1], off +; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v3, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v0, v7, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v1, v3, v1, s2 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, v1, v3, s2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, v0, v2, v1 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index d9a705c735e622..09a8d43ab1e996 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s @@ -46,26 +48,26 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_sext_i32_i8 s5, s2 +; GFX8-NEXT: s_sext_i32_i8 s6, s3 +; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -77,26 +79,26 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s6, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: 
v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -108,29 +110,29 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -220,29 +222,29 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s4, s3, 0x80008 -; GFX8-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80008 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v5, v2 +; GFX8-NEXT: s_sext_i32_i8 s2, s0 +; GFX8-NEXT: s_sext_i32_i8 s3, s1 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010 +; GFX8-NEXT: 
s_ashr_i32 s1, s1, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_ashr_i32 s0, s0, 24 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -251,29 +253,29 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s3, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v5, v2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -551,27 +553,27 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_sext_i32_i8 s5, s2 +; GFX8-NEXT: s_sext_i32_i8 s6, s3 +; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX8-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v1, s7, v2, v1 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -583,27 +585,27 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s6, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s7, v2, v1 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -615,27 +617,27 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-DL-NEXT: 
v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s2 +; GFX9-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s7, v2, v1 +; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -648,23 +650,23 @@ define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80008 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s7, v0 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s7, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -749,27 +751,27 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX8-NEXT: s_ashr_i32 s5, s3, 24 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX8-NEXT: s_ashr_i32 s7, s3, 24 +; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010 ; GFX8-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NEXT: s_ashr_i32 s0, s2, 24 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX8-NEXT: s_ashr_i32 s5, s2, 24 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80010 ; 
GFX8-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -781,27 +783,27 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s5, s3, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX9-NODL-NEXT: s_ashr_i32 s7, s3, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s5, s2, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -813,27 +815,27 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX9-DL-NEXT: s_ashr_i32 s5, s3, 24 -; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; 
GFX9-DL-NEXT: s_ashr_i32 s7, s3, 24 +; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80010 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX9-DL-NEXT: s_ashr_i32 s5, s2, 24 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80010 ; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -841,30 +843,30 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v3, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 ; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: v_mad_i32_i24 v4, s0, s1, v4 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v4 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v1, v0, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; 
GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1042,42 +1044,43 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, sext(s0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, sext(s1), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v3 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s1, s3, 0x80000 -; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, sext(s2), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s1, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, sext(s3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, sext(s4), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, sext(s5), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v7, s1, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, sext(s2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v7, sext(s5), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, v6, 16, v3 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; 
GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 4c1dfb4a645aef..0042f93f81f5e2 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s @@ -111,29 +113,29 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -608,24 +610,25 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s1, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s3, s2, s1 +; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -943,29 +946,30 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* % ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1138,29 +1142,29 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: 
s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1345,30 +1349,30 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s8, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s5, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 
0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s6, s2, v2 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s0, s1, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s3, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1452,29 +1456,29 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_sext_i32_i8 s4, s3 -; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s3, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1483,29 +1487,29 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; 
GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1514,56 +1518,57 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s3 -; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s2, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; 
GFX10-DL-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s3 -; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s6, s7, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1738,29 +1743,29 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s5, v1 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s0, s1, v3 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* 
%src2, @@ -1927,38 +1932,39 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v4, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, s3, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, s2, 16, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -2081,32 +2087,32 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s3, 24 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s2, v3 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s0, v5 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NODL-NEXT: global_store_byte 
v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2117,73 +2123,74 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 24 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s2, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s0, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; 
GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s3, s4 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_and_b32_sdwa v4, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s1 +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 16 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s1, s3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s4 +; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, s2, s1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s3, s4 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v10, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: 
v_add_nc_u32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index a60bf13fab70d8..b7d2e3e88b2830 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s @@ -62,42 +64,42 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX8-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, 
s16, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -109,42 +111,42 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -156,29 +158,29 @@ define amdgpu_kernel void 
@idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -321,49 +323,49 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX8-NEXT: s_lshr_b32 s1, s2, 12 -; GFX8-NEXT: s_lshr_b32 s7, s4, 12 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s7 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_lshr_b32 s2, s0, 12 +; GFX8-NEXT: s_lshr_b32 s4, s1, 12 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; 
GFX8-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s14 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s16 +; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -372,49 +374,49 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX9-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-NEXT: s_lshr_b32 s7, s4, 12 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s7 -; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_lshr_b32 s2, s0, 12 +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s11, 
s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -423,99 +425,100 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; 
GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v6, s1, s8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; 
GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 +; GFX10-DL-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v4 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v6, s4, s8 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40018 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v5, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s0, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s8, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s11, s12, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v7, s7, s2, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -816,52 +819,53 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0 -; GFX10-DL-NEXT: 
v_lshlrev_b16_e64 v5, 12, s1 -; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40008 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40010 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s1, s9 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40008 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX10-DL-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v5, v4 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s4, s7 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s6, s7, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s8, s0, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s9, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s8, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -992,44 +996,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword 
s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v4, s15 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX8-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s18, v2, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1041,44 +1045,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-NEXT: 
s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s19 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX9-NEXT: v_mad_i32_i24 v0, s18, v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1090,44 +1094,44 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-DL-NEXT: s_bfe_i32 s11, s4, 0x4000c -; 
GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-DL-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX9-DL-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-DL-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-DL-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s19 ; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v2, v0 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1140,36 +1144,36 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v2 -; GFX10-DL-NEXT: 
s_bfe_i32 s0, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s13, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s8, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s9, s10, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s11, s12, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s4, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s13, s14, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v0 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x4000c +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s4, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1313,56 +1317,56 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 4 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 16 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 20 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 24 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 28 -; GFX8-NEXT: s_lshl_b32 s9, s5, 8 -; GFX8-NEXT: s_lshl_b32 s11, s5, 12 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 4 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 12 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 16 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 20 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 24 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s32 -; GFX8-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s30 -; GFX8-NEXT: 
v_mad_i32_i24 v2, s16, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s28 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 20 +; GFX8-NEXT: s_lshl_b32 s11, s5, 8 +; GFX8-NEXT: s_lshl_b32 s13, s5, 12 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s5, 28 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 4 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 8 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 12 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 16 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 20 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 24 +; GFX8-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 28 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v1, s34 +; GFX8-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s32 +; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s30 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: v_mov_b32_e32 v1, s28 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s26 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s24 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s22 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s20 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s26 +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s24 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s22 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1374,56 +1378,56 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 4 -; GFX9-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 16 -; GFX9-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 20 -; GFX9-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 24 -; GFX9-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 28 -; GFX9-NEXT: s_lshl_b32 s9, s5, 8 -; GFX9-NEXT: s_lshl_b32 s11, s5, 12 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 4 -; GFX9-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 8 -; GFX9-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 
s1, s7, 12 -; GFX9-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 16 -; GFX9-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 20 -; GFX9-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 24 -; GFX9-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 28 -; GFX9-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s32 -; GFX9-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s30 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 4 +; GFX9-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 16 +; GFX9-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 20 +; GFX9-NEXT: s_lshl_b32 s11, s5, 8 +; GFX9-NEXT: s_lshl_b32 s13, s5, 12 +; GFX9-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 24 +; GFX9-NEXT: s_lshl_b32 s5, s5, 28 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 4 +; GFX9-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 8 +; GFX9-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 12 +; GFX9-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 16 +; GFX9-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 20 +; GFX9-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 24 +; GFX9-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 28 +; GFX9-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s30 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-NEXT: v_mov_b32_e32 v1, s28 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX9-NEXT: v_mov_b32_e32 v3, s26 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-NEXT: v_mov_b32_e32 v3, s24 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s20 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1435,56 +1439,56 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 28 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 8 -; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 8 -; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s32 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s30 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 20 +; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 8 +; GFX9-DL-NEXT: s_lshl_b32 s13, s5, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 24 +; GFX9-DL-NEXT: s_lshl_b32 s5, s5, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 8 +; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s30 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s28 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s26 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 
v3, s24 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s20 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1497,48 +1501,48 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 28 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 28 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 24 -; GFX10-DL-NEXT: s_lshl_b32 s13, s7, 24 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 28 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 28 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 24 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 20 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 20 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 16 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 20 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 20 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 16 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 16 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2 -; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 ; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 12 -; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2 -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 8 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 8 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 8 +; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 4 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 4 +; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[10:11], 60 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 8 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 4 -; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 4 -; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[8:9], 60 -; GFX10-DL-NEXT: 
v_mad_i32_i24 v2, s8, s14, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[10:11], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s12, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[6:7], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s10, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s4, v2 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s6, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1641,68 +1645,68 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX8-NEXT: s_lshr_b32 s0, s2, 4 -; GFX8-NEXT: s_lshr_b32 s1, s2, 8 -; GFX8-NEXT: s_lshr_b32 s5, s4, 4 -; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5 +; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX8-NEXT: s_lshr_b32 s8, s0, 4 +; GFX8-NEXT: s_lshr_b32 s9, s0, 8 +; GFX8-NEXT: s_lshr_b32 s15, s1, 4 +; GFX8-NEXT: s_lshr_b32 s16, s1, 8 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: s_lshr_b32 s0, s2, 12 -; GFX8-NEXT: s_lshr_b32 s1, s4, 12 +; GFX8-NEXT: s_lshr_b32 s7, s0, 12 +; GFX8-NEXT: s_lshr_b32 s14, s1, 12 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: s_lshr_b32 s6, s4, 16 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v7 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 20 -; GFX8-NEXT: s_lshr_b32 s1, s4, 20 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 24 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s1, s4, 28 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX8-NEXT: 
v_lshlrev_b16_e64 v14, 12, s14 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s13, s1, 16 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12 +; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX8-NEXT: s_lshr_b32 s5, s0, 20 +; GFX8-NEXT: s_lshr_b32 s12, s1, 20 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s12 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshr_b32 s11, s1, 24 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s11 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s2 +; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s10 +; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v13, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v13, v14, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v15, v17, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v16, v18, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v11, v18, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1713,63 +1717,63 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s2, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s6 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_and_b32 s18, s4, 15 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s4 +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, 
s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v1, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s5, s4, 15 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s13 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v6, v4, v6 +; GFX9-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; 
GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1780,131 +1784,132 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s6 +; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s18, s4, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s4 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v1, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v0, v4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: 
s_bfe_u32 s10, s2, 0x40014 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s13 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v4, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 28 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s5 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40018 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s9, s10 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s2 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1] +; 
GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v6, v7 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s7, s1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2290,14 +2295,14 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 @@ -2309,113 +2314,113 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 ; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s9 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s16 ; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 12 ; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8 ; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2 +; GFX10-DL-NEXT: 
v_lshlrev_b16_e64 v30, 12, s16 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s11 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s17 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s18 -; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2 -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v23, 12, s11 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v31, 12, s17 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v3 +; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 20 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v22, 12, s10 +; GFX10-DL-NEXT: v_and_b32_e32 v15, v30, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v6, v6, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v14, v14, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v15 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v27, 12, s7 -; GFX10-DL-NEXT: v_and_b32_e32 v14, v31, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v23, 12, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v26, 12, s6 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v22, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v22, 12, v15 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13 -; GFX10-DL-NEXT: v_and_b32_e32 v10, v27, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v11, v26, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v26, 12, v7 ; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 20 +; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16 ; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 20 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v27, 12, v14 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s6 +; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v15, v22, v3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s7 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s12 -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 28 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s13 ; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24 ; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 28 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v35, 12, s13 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, v27, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v34, 12, s12 +; GFX10-DL-NEXT: v_and_b32_e32 v6, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v26, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v5, v14, v3 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v15 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s15 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v7, v5 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s14 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v12, v2 -; 
GFX10-DL-NEXT: v_and_b32_e32 v18, v35, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v19, v19, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v6, v13 +; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v7, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v10, v10, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v12, v12, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v18, v18, v3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v13 +; GFX10-DL-NEXT: v_and_b32_e32 v19, v34, v3 ; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v16, v16, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v17, v17, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v8, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v16, v16, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v17, v17, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v10 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v30, 12, v11 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v19 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v35, 12, v18 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v19 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v18, 12, v18 ; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v15, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v31, 12, v10 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v16, 12, v16 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v11, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, v9, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v9, v10, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v17, 12, v17 +; GFX10-DL-NEXT: v_and_b32_e32 v10, v12, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v11, v14, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v17, 12, v17 -; GFX10-DL-NEXT: v_and_b32_e32 v10, v12, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v19, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v35, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v16, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v31, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, v30, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v6, v18, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v13, v16, v3 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v10, v11 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v17, v2 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v7, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v12, v17, v3 ; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v7, v6 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v13 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v9, v12 ; GFX10-DL-NEXT: v_and_b32_sdwa v9, v10, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v11, v4, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v8, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v11, v10 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index e80095b4899e04..5fb742b7dbaa7b 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s @@ -62,42 +64,42 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX8-NEXT: s_lshr_b32 s6, s2, 28 +; GFX8-NEXT: s_lshr_b32 s13, s4, 28 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s19, s4, 0x40004 ; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s16 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s12 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -109,42 +111,42 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s19, s4, 0x40004 ; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -156,29 +158,29 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -304,45 +306,45 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: 
v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -352,45 +354,45 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -400,87 +402,88 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 
v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -606,45 +609,45 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 
+; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -654,45 +657,45 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, 
s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -702,87 +705,88 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: 
v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, 
s14, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -909,47 +913,47 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -960,47 +964,47 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 
v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -1011,47 +1015,47 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: 
v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1059,44 +1063,45 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1208,47 +1213,47 @@ define amdgpu_kernel void 
@udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -1259,47 +1264,47 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: 
s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -1310,47 +1315,47 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, 
s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1358,44 +1363,45 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s0, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s8 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1509,44 +1515,44 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX8-NEXT: s_lshr_b32 s6, s2, 28 +; GFX8-NEXT: s_bfe_u32 s19, s4, 0x40004 +; GFX8-NEXT: s_lshr_b32 s13, s4, 28 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, 
s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s18, s4, 0x40008 ; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s15 -; GFX8-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 -; GFX8-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s19 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1558,44 +1564,44 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_bfe_u32 s19, s4, 0x40004 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 ; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: 
s_bfe_u32 s7, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1607,44 +1613,44 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s19, s4, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s18, s4, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s4, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 -; GFX9-DL-NEXT: 
v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1657,36 +1663,36 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_and_b32 s6, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s7, s4, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v0 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s7, 
s8, v3 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s9, s10, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s11, s12, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s5, s6, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s13, s14, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s4, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1815,42 +1821,42 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX8-NEXT: s_lshr_b32 s6, s2, 28 +; GFX8-NEXT: s_lshr_b32 s13, s4, 28 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s19, s4, 0x40004 ; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s16 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s12 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; 
GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1862,42 +1868,42 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s19, s4, 0x40004 ; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1909,29 +1915,29 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2032,45 +2038,45 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 
v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2082,51 +2088,51 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s5, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s14, s15 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_and_b32 s18, s4, 15 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; 
GFX9-NEXT: s_pack_ll_b32_b16 s5, s16, s17 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s6, v0 +; GFX9-NEXT: s_and_b32 s11, s2, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s4 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s6, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2137,96 +2143,97 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 28 +; 
GFX9-DL-NEXT: s_pack_ll_b32_b16 s12, s12, s13 +; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x40014 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s5, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s14, s15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s18, s4, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s16, s17 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s6, v0 +; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s6, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s18, s4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s6, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s7, s0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s8, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s8 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s5, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s7, s8 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s1, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s5 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 @@ -2548,65 +2555,66 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: 
v_mov_b32_e32 v2, 0xffff -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s6, s4, 15 -; GFX10-DL-NEXT: s_and_b32 s8, s5, 15 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s9 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s4 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s7 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s6, s8 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s2, s4 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40014 -; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s6, s0 -; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40018 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s5, v4 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s9, s1, 28 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s7 ; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s1, s8 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s7, s9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, s0, s4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5 -; GFX10-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v6 -; GFX10-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_or_b32_e32 v2, v6, v7 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s2, s8 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s0, s9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v6 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v14 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s6, s1 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v8, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s5, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2697,47 +2705,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: 
s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -2748,47 +2756,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, 
s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -2799,47 +2807,47 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; 
GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -2847,44 +2855,45 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword 
s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -2974,42 +2983,42 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s3, 15 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40018 +; GFX8-NEXT: s_and_b32 s5, s2, 15 +; GFX8-NEXT: s_and_b32 s6, s3, 15 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s17, s2, 0x40018 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v2, v3 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x40004 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s3, 0x4000c -; GFX8-NEXT: s_bfe_u32 s12, s3, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s3, 0x40014 -; GFX8-NEXT: s_bfe_u32 s16, s3, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX8-NEXT: s_bfe_u32 s8, s3, 
0x40004 +; GFX8-NEXT: s_bfe_u32 s10, s3, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s3, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX8-NEXT: s_bfe_u32 s16, s3, 0x40014 +; GFX8-NEXT: s_bfe_u32 s18, s3, 0x40018 ; GFX8-NEXT: s_lshr_b32 s3, s3, 28 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s16, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mad_u32_u24 v0, s16, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mad_u32_u24 v2, s18, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3021,42 +3030,42 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s3, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40018 +; GFX9-NEXT: s_and_b32 s5, s2, 15 +; GFX9-NEXT: s_and_b32 s6, s3, 15 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s17, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v2, v3 -; GFX9-NEXT: s_bfe_u32 s6, s3, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX9-NEXT: s_bfe_u32 s8, s3, 0x40004 +; GFX9-NEXT: s_bfe_u32 s10, s3, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s3, 0x4000c +; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s18, s3, 0x40018 ; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s16, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mad_u32_u24 v0, s16, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_u32_u24 v2, s18, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -3068,29 +3077,29 @@ define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, diff --git a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll index abcf3342fcf45b..e7ad0bd0122e6e 100644 --- a/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll @@ -1,5 +1,5 @@ ; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERR %s -; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN %s ; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy ; GCN: ; illegal copy v1 to s9 @@ -43,7 +43,8 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { } ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal SGPR to 
VGPR copy -; GCN: ; illegal copy a1 to s9 +; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1 +; GCN: ; illegal copy [[COPY1]] to s9 define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { %agpr = call i32 asm sideeffect "; def $0", "=${a1}"() call void asm sideeffect "; use $0", "${s9}"(i32 %agpr) @@ -51,7 +52,9 @@ define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { } ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy -; GCN: ; illegal copy a[0:1] to s[10:11] +; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0 +; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1 +; GCN: ; illegal copy v{{\[}}[[COPY1L]]:[[COPY1H]]] to s[10:11] define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 { %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"() call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 053a6844076405..40722ce43741f0 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -104,7 +104,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: $exec = S_MOV_B64 renamable $sgpr0_sgpr1 ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5) ; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 16 from %stack.1, align 4, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) ; GCN: S_ENDPGM 0 entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir index 09f1ba90106080..b305cfddb5a5d2 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir @@ -41,7 +41,7 @@ body: | ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; CHECK: bb.2: ; CHECK: S_ENDPGM 0 bb.0: @@ -51,7 +51,7 @@ body: | bb.1: successors: %bb.2 $vgpr0 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec bb.2: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir index 9f39dc34150993..5797bb5cfa29a9 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir @@ -49,10 +49,10 @@ body: | bb.0 (%ir-block.2): $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 
0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) EXP_DONE 0, killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, -1, -1, 15, implicit $exec $vgpr0 = V_MOV_B32_e32 1056964608, implicit $exec $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index f31c1b5c87fb9b..2a02a9a9ee4d1b 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,CIVI,VI %s @@ -515,19 +517,19 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x3e7 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x3e7 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v3, s4, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0: @@ -538,14 +540,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 
v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0: @@ -556,14 +558,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -581,19 +583,19 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e64 v1, 16, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: @@ -605,15 +607,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: s_lshr_b32 s1, s4, 16 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, s1, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0_reghi: @@ -625,15 +627,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: s_lshr_b32 s1, s4, 16 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, s1, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -652,18 +654,18 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v3, 53, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, v1, 53, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_inlineimm: @@ -674,14 +676,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 53, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 53, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0_inlineimm: @@ -692,14 +694,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: 
v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 53, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 53, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -717,37 +719,37 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x3e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x3e7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3e70000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_1: @@ -758,14 +760,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2 -; CI-NEXT: 
flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -786,32 +788,32 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, -15, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, -15, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_1_inlineimm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xfff10000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_1_inlineimm: @@ -822,14 +824,14 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -846,19 +848,19 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4500 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0: @@ -869,14 +871,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_0: @@ -887,14 +889,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -915,14 +917,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, 53 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 53 +; GFX9-NEXT: 
global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0_inlineimm: @@ -933,14 +935,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 53, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 53, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_0_inlineimm: @@ -951,14 +953,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 53, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 53, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -975,37 +977,37 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x4500 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x45000000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
+; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_1: @@ -1016,14 +1018,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1044,32 +1046,32 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, 35, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, 35, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_1_inlineimm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x230000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x230000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_1_inlineimm: @@ 
-1080,14 +1082,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspac ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1167,20 +1169,20 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, s0, v3, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: @@ -1188,20 +1190,20 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 -; VI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 4 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s0, 0xffff, s2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, s0, v3, v4 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: @@ -1209,20 +1211,20 @@ define amdgpu_kernel void 
@v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 -; CI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v4, v[0:1] -; CI-NEXT: s_lshl_b32 s2, s4, 4 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_lshl_b32 s0, 0xffff, s2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: s_lshl_b32 s0, s4, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; CI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v2, s0, v3, v4 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1239,80 +1241,80 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, 0xffff -; GFX9-NEXT: s_mov_b32 s7, 0x12341234 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x12341234 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v2, s7, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0x12341234 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; VI-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; VI-NEXT: s_mov_b32 s0, 0x12341234 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, v2, s7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_bfi_b32 v0, v1, s0, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s6, 0x12341234 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v4, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v2, v[2:3] +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: s_mov_b32 s0, 0x12341234 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; CI-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v2, v2, s6, v3 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_bfi_b32 v0, v1, s0, v0 +; CI-NEXT: flat_store_dword v[4:5], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1356,13 +1358,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v0, s1, v0 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: 
s_endpgm ; @@ -1376,13 +1378,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, s1, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1423,14 +1425,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -1447,13 +1449,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v0, s1, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1498,13 +1500,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v1, s1, v1 +; VI-NEXT: v_or_b32_e32 v1, s0, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1518,13 +1520,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, 
v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1565,14 +1567,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -1589,13 +1591,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1640,13 +1642,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v1, s1, v1 +; VI-NEXT: v_or_b32_e32 v1, s0, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1660,13 +1662,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1686,53 +1688,52 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: 
s_mov_b32 s5, 0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s6 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v5, s1, v1 -; GFX9-NEXT: v_bfi_b32 v0, v4, s1, v0 +; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1 +; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 s5, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s6, s4 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s1, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s2, s4, s0 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_lshl_b32 s3, s2, 16 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] +; VI-NEXT: s_or_b32 s0, s2, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1744,26 +1745,26 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_load_dword s6, s[4:5], 0x4 -; CI-NEXT: s_mov_b32 s4, 0xffff +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_mov_b32 s6, 0xffff ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_mov_b32 s5, 0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s2, s6, 16 -; CI-NEXT: s_and_b32 s3, s6, s4 +; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: s_and_b32 s3, s4, s6 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_or_b32 s1, s3, s2 +; CI-NEXT: s_or_b32 s0, s3, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; 
CI-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; CI-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_bfi_b32 v1, v5, s1, v1 -; CI-NEXT: v_bfi_b32 v0, v4, s1, v0 +; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 +; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1785,24 +1786,24 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX9-NEXT: s_lshl_b32 s2, s5, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s3, s5, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0 +; GFX9-NEXT: v_bfi_b32 v1, s1, v5, v1 +; GFX9-NEXT: v_bfi_b32 v0, s0, v4, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -1811,20 +1812,20 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_and_b32 s2, s4, s6 -; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_lshl_b32 s4, s5, 4 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_lshl_b32 s2, s5, 4 +; VI-NEXT: s_and_b32 s3, s4, s0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: s_lshl_b32 s2, s3, 16 +; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1839,20 +1840,20 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; CI-NEXT: s_mov_b32 s6, 0xffff -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_and_b32 s2, s4, s6 -; CI-NEXT: s_lshl_b32 s3, s4, 16 -; CI-NEXT: s_or_b32 s2, s2, s3 -; CI-NEXT: s_lshl_b32 s4, s5, 4 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, 
s0, v2 -; CI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_and_b32 s2, s4, s0 +; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_or_b32 s2, s2, s4 +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: s_lshl_b32 s3, s5, 4 +; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_mov_b32_e32 v5, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll index 34e1d201c9cb62..05cb31c70722d3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -1,12 +1,13 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7 - -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll index 95e38c36e62a0b..f70bd112d2ef64 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -1,9 +1,11 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e7 diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir index 6e67f7df30a7bd..1ab10fa92f7b89 100644 --- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -230,17 +230,17 @@ name: vmem_gt_8dw_store body: | bb.0: - BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec BUFFER_ATOMIC_CMPSWAP_X2_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec @@ -553,10 +553,10 @@ body: | dead $sgpr6_sgpr7 = KILL $sgpr4_sgpr5 $sgpr8 = S_MOV_B32 $sgpr5 $vgpr0 = V_MOV_B32_e32 killed $sgpr8, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr + 4) + BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr + 4) $sgpr8 = S_MOV_B32 $sgpr4, implicit killed $sgpr4_sgpr5 $vgpr0 = V_MOV_B32_e32 killed $sgpr8, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr) + BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr) S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir index 48f0be4ff8fd70..0a60eaf7c03f91 100644 --- a/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir +++ b/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir @@ -64,7 +64,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec S_BRANCH %bb.3 @@ -72,7 +72,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec bb.3.done: @@ -80,7 +80,7 @@ body: | $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir index 69c038b976b8ac..566b1c06fb125e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir @@ -12,7 +12,7 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -24,7 +24,7 @@ name: hazard_buf_branch_lds body: | bb.0: successors: %bb.1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: @@ -56,11 +56,11 @@ name: no_hazard_buf_branch_buf body: | bb.0: successors: %bb.1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -75,7 +75,7 @@ body: | $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -87,7 +87,7 @@ name: no_hazard_lds_branch_buf_samebb body: | bb.0: $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -101,7 +101,7 @@ body: | bb.0: successors: %bb.0 $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.0 ... @@ -118,8 +118,8 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -137,7 +137,7 @@ body: | bb.1: $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -150,11 +150,11 @@ body: | bb.0: successors: %bb.1 $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -171,7 +171,7 @@ body: | bb.1: S_WAITCNT_VSCNT undef $sgpr_null, 1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -189,7 +189,7 @@ body: | bb.1: S_WAITCNT_VSCNT undef $sgpr_null, 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -206,7 +206,7 @@ body: | bb.1: S_WAITCNT_VSCNT undef $sgpr0, 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -223,7 +223,7 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 90d7b693b60140..3466e05c00f379 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI @@ -21,10 +23,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -33,10 +35,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -62,8 +64,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1) ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -73,8 +75,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1) ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) @@ -123,14 +125,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: 
flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: @@ -145,14 +147,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -191,13 +193,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: @@ -208,13 +210,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -251,13 +253,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: @@ -268,13 +270,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -318,14 +320,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: @@ -340,14 +342,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -394,14 +396,14 @@ define amdgpu_kernel void 
@v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: @@ -416,14 +418,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -470,14 +472,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: @@ -492,14 +494,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 -; GFX9-NEXT: 
global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -547,14 +549,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: @@ -569,14 +571,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index cdcf7383afc091..318fe260552676 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s @@ -149,7 +151,7 @@ define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* % ; NOLOOP: s_mov_b32 m0, 0{{$}} ; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; NOLOOP-NEXT: load_dword +; NOLOOP: load_dword define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) %load = load volatile i32, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll index 8d5cdf1c27049f..b3bd5e6d90379a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s @@ -304,7 +306,7 @@ define amdgpu_kernel void @v_icmp_i16_sle(i64 addrspace(1)* %out, i16 %src) { ; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1 ; GCN-NEXT: v_mov_b32_e32 ; GCN-NEXT: v_mov_b32_e32 -; GCN-NEXT: {{global|flat|buffer}}_store_dwordx2 +; GCN: {{global|flat|buffer}}_store_dwordx2 define amdgpu_kernel void @v_icmp_i1_ne0(i64 addrspace(1)* %out, i32 %a, i32 %b) { %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll index 958a72566b5f7b..9f18f4df40bf62 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -400,6 +400,46 @@ main_body: ret void } +;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +;CHECK: s_waitcnt +define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_swizzled_not_merged: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 +;CHECK: s_waitcnt +define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) { +main_body: + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 8) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 8) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 8) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 8) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 8) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll index 7d1a5a3b99a0f2..1bfe0aa4086e7e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -276,6 +276,37 @@ main_body: ret void } +;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_swizzled_not_merged: +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 +define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void 
@llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 8) + ret void +} + declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 0bf3125b22bf68..f5bbc31e98b540 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,VARIANT0 %s ; RUN: llc -march=amdgcn -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,VARIANT1 %s @@ -9,11 +11,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT0: ; %bb.0: ; %entry ; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb -; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 +; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -30,11 +32,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT1: ; %bb.0: ; %entry ; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb -; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 +; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier @@ -51,45 +53,45 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { ; VARIANT2: ; %bb.0: ; %entry ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c -; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: v_mov_b32_e32 v4, s3 -; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0 -; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT2-NEXT: v_mov_b32_e32 v2, s3 +; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0 +; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] +; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; 
VARIANT2-NEXT: global_store_dword v[1:2], v0, off +; VARIANT2-NEXT: v_mov_b32_e32 v0, s3 ; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 -; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] -; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; VARIANT2-NEXT: global_store_dword v[3:4], v0, off -; VARIANT2-NEXT: v_mov_b32_e32 v5, s3 -; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 -; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc +; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier -; VARIANT2-NEXT: global_load_dword v0, v[0:1], off +; VARIANT2-NEXT: global_load_dword v0, v[3:4], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v[3:4], v0, off +; VARIANT2-NEXT: global_store_dword v[1:2], v0, off ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c -; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: v_mov_b32_e32 v4, s3 -; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0 -; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT3-NEXT: v_mov_b32_e32 v2, s3 +; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0 +; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] +; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; VARIANT3-NEXT: global_store_dword v[1:2], v0, off +; VARIANT3-NEXT: v_mov_b32_e32 v0, s3 ; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 -; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] -; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; VARIANT3-NEXT: global_store_dword v[3:4], v0, off -; VARIANT3-NEXT: v_mov_b32_e32 v5, s3 -; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 -; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc +; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc ; VARIANT3-NEXT: s_barrier -; VARIANT3-NEXT: global_load_dword v0, v[0:1], off +; VARIANT3-NEXT: global_load_dword v0, v[3:4], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v[3:4], v0, off +; VARIANT3-NEXT: global_store_dword v[1:2], v0, off ; VARIANT3-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index f10b504a92bf45..9e4779ded4bbcd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI %s @@ -1537,8 +1539,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, s2 diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll index 1577d8b737db21..b8b963edf66e38 100644 --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s @@ -5,11 +7,11 @@ ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo: ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16 -; GFX900-NEXT: ds_write_b16 v3, v2 +; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GFX900-DAG: s_waitcnt lgkmcnt(0) +; GFX900-DAG: v_mov_b32_e32 v1, v2 +; GFX900-DAG: ds_read_u16_d16_hi v1, v0 offset:16 +; GFX900: ds_write_b16 [[ZERO]], v2 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -27,14 +29,13 @@ entry: ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi: ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v1, v0 -; GFX900-NEXT: ds_read_u16 v0, v0 offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(1) -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0 +; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16 +; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]] +; GFX900-DAG: s_waitcnt lgkmcnt(0) +; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]] +; GFX900: v_lshl_or_b32 [[HI]], [[HI]], 16, [[AND]] ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index 66b0564fa59fe1..71b24f98979f26 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 
Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s @@ -518,12 +520,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, ; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v3, v0 -; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX900-NEXT: ds_write_b16 v2, v0 +; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -531,12 +533,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: ds_write_b16 v3, v0 -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: ds_write_b16 v2, v0 +; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -580,10 +582,10 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: ds_write_b16 v3, v2 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX906-NEXT: v_mov_b32_e32 v4, 0 -; GFX906-NEXT: ds_write_b16 v4, v3 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off @@ -594,13 +596,13 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX803-NEXT: ds_write_b16 v3, v1 +; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_mov_b32_e32 v2, 0 +; GFX803-NEXT: ds_write_b16 v2, v1 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -618,12 +620,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa ; GFX900: ; %bb.0: ; 
%entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: ds_write_b16 v3, v5 -; GFX900-NEXT: v_bfi_b32 v0, v4, v0, v1 +; GFX900-NEXT: ds_write_b16 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -632,12 +634,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX906-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 -; GFX906-NEXT: ds_write_b16 v3, v5 -; GFX906-NEXT: v_bfi_b32 v0, v4, v0, v1 +; GFX906-NEXT: ds_write_b16 v3, v4 +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -648,11 +650,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 ; GFX803-NEXT: ds_write_b16 v3, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -994,13 +996,13 @@ define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 { ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; FIXME: and should be removable ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; FIXME: and should be removable entry: %reg.bc = bitcast i32 %reg to <2 x half> %load = load half, half* %in @@ -1034,10 +1036,10 @@ define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 { ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1116,10 +1118,10 @@ define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 { ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: 
flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1523,11 +1525,11 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1605,11 +1607,11 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll index a8f5603ef23741..5ca8e0baa18ba7 100644 --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s @@ -112,22 +114,22 @@ bb: define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) { ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] -; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] ; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1 +; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v0 ; GCN-NEXT: v_mul_i32_i24_e32 v0, 0xfffff9, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index dda1efdbc17847..604d43f2b36eb4 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -225,12 +225,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; FIXME: Should be able to use mixlo/mixhi ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt: -; GFX9: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9: v_cvt_f16_f32_e32 v1, v3 +; GFX9: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp ; GFX9: v_cvt_f16_f32_e32 v0, v0 -; GFX9: v_and_b32_e32 v1, 0xffff, v1 -; GFX9: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9: v_cvt_f16_f32_e32 v1, v3 +; GFX9: v_and_b32_e32 v0, 0xffff, v0 +; GFX9: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9: s_setpc_b64 define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 8ae3e24119d0a7..53ecf1fd833a18 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CI %s ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s @@ -101,7 +103,7 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 { ; CI: v_bfe_i32 v[[B1:[0-9]+]], v1, 0, 31 ; CI: v_ashr_i64 ; CI: v_bfe_i32 v[[B2:[0-9]+]], v0, 0, 31 -; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v[[B2]], v[[B1]], v[1:2] +; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v[[B2]], v[[B1]], v{{\[[0-9]+:[0-9]+\]}} define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { %sext0 = sext i31 %arg0 to i63 %sext1 = sext i31 %arg1 to i63 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index 371831f282312c..03de2476a8f449 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VIPLUS,VI ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VIPLUS,GFX9 @@ -17,13 +19,13 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v3, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v0, v0, v2 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_i16: @@ -39,13 +41,13 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_i16_e32 v0, v0, v2 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -74,15 +76,15 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, < ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v4, v3, v2 -; VI-NEXT: v_max_i16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v1, v0, v2 +; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v2i16: @@ -98,13 +100,13 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, < ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v3, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid @@ -124,35 +126,35 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v8 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v8 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v2 -; VI-NEXT: flat_load_dword v9, v[0:1] -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v4, v[4:5] -; VI-NEXT: flat_load_dword v5, v[2:3] -; VI-NEXT: flat_load_ushort v6, v[6:7] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v8 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v2 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_ushort v3, v[8:9] +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_ushort v1, v[6:7] +; VI-NEXT: v_add_u32_e32 v10, vcc, 4, v4 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(1) 
lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v7, v5, v9 -; VI-NEXT: v_max_i16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_i16_e32 v6, v0, v2 +; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v4, v6, v4 -; VI-NEXT: v_or_b32_e32 v5, v7, v5 -; VI-NEXT: flat_store_dword v[0:1], v5 -; VI-NEXT: flat_store_short v[2:3], v4 +; VI-NEXT: v_max_i16_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v6, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: flat_store_short v[10:11], v1 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: @@ -272,13 +274,13 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v3, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v0, v0, v2 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sgt_i16: @@ -294,13 +296,13 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_i16_e32 v0, v0, v2 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -329,13 +331,13 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v3, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v0, v0, v2 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_uge_i16: @@ -351,13 +353,13 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: 
global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_u16_e32 v0, v0, v2 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -386,13 +388,13 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v3, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v0, v0, v2 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_i16: @@ -408,13 +410,13 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_u16_e32 v0, v0, v2 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -442,15 +444,15 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, < ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v4, v3, v2 -; VI-NEXT: v_max_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v1, v0, v2 +; VI-NEXT: v_max_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: 
v_test_umax_ugt_v2i16: @@ -466,13 +468,13 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, < ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_u16 v2, v3, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_max_u16 v0, v0, v2 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir index f33c2115dcb25a..673fff50b39c43 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir @@ -86,7 +86,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 0 S_WAITCNT 127 - $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep) + $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep) $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir index 1046b9729df46d..99348a57b9f6a8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir @@ -23,13 +23,13 @@ body: | $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) S_CBRANCH_SCC0 %bb.1, implicit killed $scc bb.2: @@ -55,7 +55,7 @@ body: | S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed 
$vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir index bf24ce15acb623..f52275af48c9e7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir @@ -117,13 +117,13 @@ body: | $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc bb.2.else: @@ -149,7 +149,7 @@ body: | S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir index a6088b0677a06f..c543b80454b62e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir @@ -97,13 +97,13 @@ body: | $sgpr10 = S_MOV_B32 4294967295, 
implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc bb.2.else: @@ -129,7 +129,7 @@ body: | S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.mir b/llvm/test/CodeGen/AMDGPU/memory_clause.mir index ac412a8fc29b63..b46cfb16b7ba5b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.mir +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.mir @@ -337,7 +337,7 @@ body: | # GCN: dead early-clobber %4:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vgpr_32 = BUNDLE %0, %2, %1, implicit $exec { # GCN-NEXT: dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } --- @@ -357,7 +357,7 @@ body: | %2 = IMPLICIT_DEF %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec - %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, implicit $exec + %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: atomic{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir index 09f7a5bbdc5737..af1af2487f84e1 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-image-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-image-load.mir @@ -3,7 +3,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s # GFX9-LABEL: name: image_load_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -15,13 +15,13 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sreg_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sreg_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_load_merged_v1v3_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2 @@ -33,14 +33,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sreg_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sreg_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_merged_v2v2 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub0_sub1 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub2_sub3 @@ -52,14 +52,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sreg_256, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sreg_256, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_load_merged_v2v2_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub2_sub3 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub0_sub1 @@ -71,14 +71,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sreg_256, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sreg_256, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_merged_v3v1 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4), (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub0_sub1_sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub3 @@ -90,14 +90,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sreg_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sreg_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... --- # GFX9-LABEL: name: image_load_merged_v3v1_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4), (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0 @@ -109,14 +109,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sreg_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sreg_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_divided_merged -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) name: image_load_divided_merged body: | @@ -126,12 +126,12 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sreg_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %9:vreg_96 = IMAGE_LOAD_V3_V4 %7:vreg_128, %3:sreg_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %11:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sreg_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- @@ -148,7 +148,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vreg_128 = COPY %2 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sreg_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sreg_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sreg_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) @@ -167,7 +167,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sreg_256, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sreg_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -185,7 +185,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sreg_256, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sreg_256, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -203,8 +203,8 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -223,7 +223,7 @@ body: | %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -241,7 +241,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -259,7 +259,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -277,7 +277,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -295,7 +295,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -313,7 +313,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -331,7 +331,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -349,7 +349,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -367,14 +367,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_mip_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -386,7 +386,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_MIP_V1_V4 %5:vreg_128, %3:sreg_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_MIP_V3_V4 %5:vreg_128, %3:sreg_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -395,7 +395,7 @@ body: | # GFX9-LABEL: name: image_load_mip_pck_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -407,7 +407,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_V1_V4 %5:vreg_128, %3:sreg_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_MIP_PCK_V3_V4 %5:vreg_128, %3:sreg_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -416,7 +416,7 @@ body: | # GFX9-LABEL: name: image_load_mip_pck_sgn_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -428,14 +428,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_SGN_V1_V4 %5:vreg_128, %3:sreg_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_MIP_PCK_SGN_V3_V4 %5:vreg_128, %3:sreg_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_load_pck_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -447,14 +447,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_PCK_V1_V4 %5:vreg_128, %3:sreg_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_PCK_V3_V4 %5:vreg_128, %3:sreg_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_pck_sgn_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -466,7 +466,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_PCK_SGN_V1_V4 %5:vreg_128, %3:sreg_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_PCK_SGN_V3_V4 %5:vreg_128, %3:sreg_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... diff --git a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir index 93c7a828922e5e..72d842db73a07b 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-image-sample.mir @@ -3,7 +3,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s # GFX9-LABEL: name: image_sample_l_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -15,13 +15,13 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_merged_v1v3_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2 @@ -33,14 +33,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_merged_v2v2 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub0_sub1 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub2_sub3 @@ -52,14 +52,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_merged_v2v2_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub2_sub3 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub0_sub1 @@ -71,14 +71,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_merged_v3v1 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4), (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub0_sub1_sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub3 @@ -90,14 +90,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_merged_v3v1_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4), (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0 @@ -109,14 +109,14 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_divided_merged -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) name: image_sample_l_divided_merged body: | @@ -126,12 +126,12 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %9:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %7:vreg_128, %3:sreg_256, %2:sreg_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %11:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
 ---
@@ -148,7 +148,7 @@ body: |
     %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vreg_128 = COPY %2
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sreg_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
@@ -167,7 +167,7 @@ body: |
     %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -185,7 +185,7 @@ body: |
     %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -203,8 +203,8 @@ body: |
     %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
     %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0
     %4:vgpr_32 = COPY %2.sub3
-    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
-    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
+    %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16)
     %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4)
     %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4)
 ...
@@ -223,7 +223,7 @@ body: | %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -242,7 +242,7 @@ body: | %3:sreg_128 = COPY $sgpr92_sgpr93_sgpr94_sgpr95 %4:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -260,7 +260,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -278,7 +278,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -296,7 +296,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -314,7 +314,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -332,7 +332,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -350,7 +350,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -368,7 +368,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -386,7 +386,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -396,7 +396,7 @@ body: | # GFX9-LABEL: name: image_sample_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -408,7 +408,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -416,7 +416,7 @@ body: | # GFX9-LABEL: name: image_sample_b_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -428,7 +428,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_B_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -436,7 +436,7 @@ body: | # GFX9-LABEL: name: image_sample_b_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -448,7 +448,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -456,7 +456,7 @@ body: | # GFX9-LABEL: name: image_sample_b_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -468,7 +468,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -476,7 +476,7 @@ body: | # GFX9-LABEL: name: image_sample_b_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -488,7 +488,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -496,7 +496,7 @@ body: | # GFX9-LABEL: name: image_sample_c_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -508,7 +508,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -516,7 +516,7 @@ body: | # GFX9-LABEL: name: image_sample_cd_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -528,7 +528,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CD_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -536,7 +536,7 @@ body: | # GFX9-LABEL: name: image_sample_cd_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -548,7 +548,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -556,7 +556,7 @@ body: | # GFX9-LABEL: name: image_sample_cd_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -568,7 +568,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -576,7 +576,7 @@ body: | # GFX9-LABEL: name: image_sample_cd_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -588,7 +588,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CD_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -596,7 +596,7 @@ body: | # GFX9-LABEL: name: image_sample_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -608,7 +608,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -616,7 +616,7 @@ body: | # GFX9-LABEL: name: image_sample_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -628,7 +628,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -636,7 +636,7 @@ body: | # GFX9-LABEL: name: image_sample_c_b_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -648,7 +648,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -656,7 +656,7 @@ body: | # GFX9-LABEL: name: image_sample_c_b_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -668,7 +668,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -676,7 +676,7 @@ body: | # GFX9-LABEL: name: image_sample_c_b_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -688,7 +688,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -696,7 +696,7 @@ body: | # GFX9-LABEL: name: image_sample_c_b_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -708,7 +708,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -716,7 +716,7 @@ body: | # GFX9-LABEL: name: image_sample_c_cd_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -728,7 +728,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CD_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -736,7 +736,7 @@ body: | # GFX9-LABEL: name: image_sample_c_cd_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -748,7 +748,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -756,7 +756,7 @@ body: | # GFX9-LABEL: name: image_sample_c_cd_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -768,7 +768,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -776,7 +776,7 @@ body: | # GFX9-LABEL: name: image_sample_c_cd_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -788,7 +788,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CD_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -796,7 +796,7 @@ body: | # GFX9-LABEL: name: image_sample_c_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -808,7 +808,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -816,7 +816,7 @@ body: | # GFX9-LABEL: name: image_sample_c_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -828,7 +828,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -836,7 +836,7 @@ body: | # GFX9-LABEL: name: image_sample_c_d_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -848,7 +848,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -856,7 +856,7 @@ body: | # GFX9-LABEL: name: image_sample_c_d_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -868,7 +868,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -876,7 +876,7 @@ body: | # GFX9-LABEL: name: image_sample_c_d_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -888,7 +888,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -896,7 +896,7 @@ body: | # GFX9-LABEL: name: image_sample_c_d_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -908,7 +908,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -916,7 +916,7 @@ body: | # GFX9-LABEL: name: image_sample_c_l_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -928,7 +928,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -936,7 +936,7 @@ body: | # GFX9-LABEL: name: image_sample_c_lz_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -948,7 +948,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -956,7 +956,7 @@ body: | # GFX9-LABEL: name: image_sample_c_lz_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -968,7 +968,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -976,7 +976,7 @@ body: | # GFX9-LABEL: name: image_sample_c_l_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -988,7 +988,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -996,7 +996,7 @@ body: | # GFX9-LABEL: name: image_sample_c_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1008,7 +1008,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -1016,7 +1016,7 @@ body: | # GFX9-LABEL: name: image_sample_d_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1028,7 +1028,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_D_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1036,7 +1036,7 @@ body: | # GFX9-LABEL: name: image_sample_d_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1048,7 +1048,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -1056,7 +1056,7 @@ body: | # GFX9-LABEL: name: image_sample_d_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1068,7 +1068,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1076,7 +1076,7 @@ body: | # GFX9-LABEL: name: image_sample_d_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1088,7 +1088,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -1096,7 +1096,7 @@ body: | # GFX9-LABEL: name: image_sample_lz_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1108,7 +1108,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1116,7 +1116,7 @@ body: | # GFX9-LABEL: name: image_sample_lz_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1128,7 +1128,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -1136,7 +1136,7 @@ body: | # GFX9-LABEL: name: image_sample_l_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1148,7 +1148,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1156,7 +1156,7 @@ body: | # GFX9-LABEL: name: image_sample_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 4, addrspace 4), (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1168,7 +1168,7 @@ body: | %2:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sreg_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_O_V3_V4 %5:vreg_128, %3:sreg_256, %2:sreg_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir index becd2e1b9c1ec8..6bff48467b5942 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir @@ -169,10 +169,10 @@ body: | --- # CHECK-LABEL: merge_mmos # CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 8, align 4) -# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4) -# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4) -# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4 -# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4 +# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4) +# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4 +# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4 name: merge_mmos tracksRegLiveness: true body: | @@ -182,14 +182,14 @@ body: | %0:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 4) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 1, 0, 0 :: (dereferenceable invariant load 4) - %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) - %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) - BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) - BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
store 4) - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64) - %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68) - BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64) - BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68) + %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) + %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) + BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) + BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68) + BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64) + BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68) S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index 9cfd92f86c4fb5..ccff6bb51275e4 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -25,7 +25,7 @@ # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL bb.2: @@ -47,7 +47,7 @@ # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -72,7 +72,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 
0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -94,7 +94,7 @@ body: | # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL bb.2: @@ -116,7 +116,7 @@ body: | # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -141,7 +141,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -163,7 +163,7 @@ body: | # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL bb.2: @@ -185,7 +185,7 @@ body: | # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -210,7 +210,7 @@ body: | 
%1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -226,7 +226,7 @@ body: | # ADDR64: %9:vgpr_32, %12:sreg_64_xexec = V_ADD_I32_e64 %14.sub0, %4.sub0, 0, implicit $exec # ADDR64: %10:vgpr_32, dead %13:sreg_64_xexec = V_ADDC_U32_e64 %14.sub1, %4.sub1, killed %12, 0, implicit $exec # ADDR64: %11:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1 -# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec +# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec --- name: addr64 liveins: @@ -246,7 +246,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -269,7 +269,7 @@ body: | # W64-NO-ADDR64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W64-NO-ADDR64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64-NO-ADDR64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-NO-ADDR64-LABEL bb.2: @@ -289,7 +289,7 @@ body: | # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -303,7 +303,7 @@ body: | # ADDR64: [[RSRCFMTHI:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 # ADDR64: [[ZERORSRC:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[ZERO64]], %subreg.sub0_sub1, [[RSRCFMTLO]], %subreg.sub2, [[RSRCFMTHI]], %subreg.sub3 # ADDR64: [[VADDR64:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[RSRCPTR]].sub0, %subreg.sub0, [[RSRCPTR]].sub1, %subreg.sub1 -# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, 0, implicit 
$exec --- name: offset @@ -324,7 +324,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll index a668a19c2f0cf0..3c5a65146e6179 100644 --- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll +++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}sample_contig_nsa: @@ -21,8 +23,8 @@ main_body: } ; GCN-LABEL: {{^}}sample_contig_nsa_10vgprs: -; GCN-DAG: image_sample_c_l v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}], -; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}], +; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], define amdgpu_ps <2 x float> @sample_contig_nsa_10vgprs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) #0 { main_body: %zcompare.1 = fadd float %zcompare, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir index 0dd135723d8cd9..39d3efe2a1de39 100644 --- a/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir @@ -9,7 +9,7 @@ name: hazard_image_sample_d_buf_off6 body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: no_hazard_image_sample_d_buf_off1 @@ -20,7 +20,7 @@ name: no_hazard_image_sample_d_buf_off1 body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: name: no_hazard_image_sample_d_buf_far @@ -33,7 +33,7 @@ body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec V_NOP_e32 implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec ... # Non-NSA @@ -45,7 +45,7 @@ name: no_hazard_image_sample_v4_v2_buf_off6 body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 undef $vgpr1_vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec ... # Less than 4 dwords @@ -57,5 +57,5 @@ name: no_hazard_image_sample_v4_v3_buf_off6 body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V3_nsa_gfx10 undef $vgpr1, undef $vgpr2, undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir index 3af2f0457fbba1..0e7708210a90e7 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -137,7 +137,7 @@ body: | %28 = REG_SEQUENCE %6, 17, killed %27, 18 %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 - BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -243,7 +243,7 @@ body: | %37 = REG_SEQUENCE %6, 17, killed %36, 18 %38 = V_MOV_B32_e32 0, implicit $exec %39 = COPY %33 - BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -332,7 +332,7 @@ body: | %28 = REG_SEQUENCE %6, 17, killed %27, 18 %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 - BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir index 7da7dc8d319920..53ca546969b9c1 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir @@ -151,7 +151,7 @@ 
body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -159,7 +159,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -188,7 +188,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -196,7 +196,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -225,7 +225,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -233,14 +233,14 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
--- # CHECK-LABEL: name: optimize_if_and_saveexec_xor_valu_middle # CHECK: $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc -# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec +# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec # CHECK-NEXT: $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3 # CHECK-NEXT: SI_MASK_BRANCH @@ -255,7 +255,7 @@ body: | $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec $vgpr0 = V_MOV_B32_e32 4, implicit $exec $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc - BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc $exec = S_MOV_B64_term killed $sgpr2_sgpr3 SI_MASK_BRANCH %bb.2, implicit $exec @@ -266,7 +266,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -274,7 +274,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -304,7 +304,7 @@ body: | bb.1.if: liveins: $sgpr0_sgpr1 , $sgpr4_sgpr5_sgpr6_sgpr7 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr4_sgpr5_sgpr6_sgpr7 @@ -312,7 +312,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -346,7 +346,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -356,7 +356,7 @@ body: | $sgpr1 = S_MOV_B32 1 $sgpr2 = S_MOV_B32 -1 $sgpr3 = S_MOV_B32 61440 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -387,7 +387,7 @@ body: | S_SLEEP 0, implicit $sgpr2_sgpr3 $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -395,7 +395,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -426,7 +426,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -434,7 +434,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -463,7 +463,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -471,7 +471,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -500,7 +500,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -508,7 +508,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -539,7 +539,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -547,6 +547,6 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir index 7a1cfa32a60c3d..39915f2755ce0f 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -27,12 +27,12 @@ body: | ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc - ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) + ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_BRANCH %bb.1 ; CHECK: bb.1: ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc - ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) + ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_ENDPGM 0, implicit $vgpr0 bb.0: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir index 3c99cc7c19daad..807029a92f3489 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -46,7 +46,7 @@ body: | %15:sreg_32_xm0 = S_MOV_B32 61440 %16:sreg_32_xm0 = S_MOV_B32 -1 %17:sreg_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3 - BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) %19:vgpr_32 = COPY %4 %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 diff --git a/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir b/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir index ed648ece0c71af..66bd4c163c669a 100644 --- a/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir +++ b/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir @@ -17,6 +17,6 @@ body: | S_BARRIER $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32 undef $vgpr0, undef $vgpr0, 0, 0, 0, 2, implicit $exec $vgpr0 = V_ACCVGPR_READ_B32 $agpr31, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, 0, implicit $exec ... 
diff --git a/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir b/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir index 196301f4cb07bf..1d9ab685c5320e 100644 --- a/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir +++ b/llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir @@ -185,7 +185,7 @@ body: | bb.28: %9 = S_FF1_I32_B32 undef %10 %13 = V_MAD_U32_U24 killed %9, 48, 32, 0, implicit $exec - %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) %46 = V_AND_B32_e32 1, killed %45, implicit $exec %21 = S_BUFFER_LOAD_DWORD_SGPR undef %22, undef %23, 0, 0 :: (dereferenceable invariant load 4) %25 = V_CMP_GE_F32_e64 0, 0, 0, killed %21, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir index 016c5ad3202397..a92fe49e1b7374 100644 --- a/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir +++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir @@ -70,7 +70,7 @@ body: | %13.sub2_sub3 = COPY killed %12 %20 = V_LSHL_B64 killed %19, 2, implicit $exec %16 = COPY killed %5 - BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/regcoalescer-assert-from-incorrect-subrange-extension.mir b/llvm/test/CodeGen/AMDGPU/regcoalescer-assert-from-incorrect-subrange-extension.mir index f866d2d151efaa..bf4b61d24ce608 100644 --- a/llvm/test/CodeGen/AMDGPU/regcoalescer-assert-from-incorrect-subrange-extension.mir +++ b/llvm/test/CodeGen/AMDGPU/regcoalescer-assert-from-incorrect-subrange-extension.mir @@ -64,7 +64,7 @@ body: | %12:sgpr_32 = COPY killed $sgpr0 %64:vgpr_32 = V_ADD_I32_e32 killed %12, killed %13, implicit-def dead $vcc, implicit $exec %17:sreg_32_xm0 = S_MOV_B32 0 - %18:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN killed %64, undef %20:sreg_128, 0, 28, 116, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) + %18:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN killed %64, undef %20:sreg_128, 0, 28, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) dead %65:vgpr_32 = V_BFE_U32 killed %18, 28, 1, implicit $exec S_CBRANCH_SCC0 %bb.2, implicit undef $scc diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir index 0413075dd86c9f..331cccd853c246 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -134,10 +134,10 @@ body: | %6.sub2 = COPY %6.sub0 bb.2: - BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 S_SETPC_B64_return $sgpr30_sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index f57ca3ad600598..088556fed5e516 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}v_sad_u32_pat1: @@ -255,10 +257,10 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) { ; GCN-LABEL: {{^}}s_sad_u32_i8_pat2: ; GCN: s_load_dword ; GCN: s_bfe_u32 -; GCN: s_sub_i32 -; GCN: s_and_b32 -; GCN: s_sub_i32 -; GCN: s_lshr_b32 +; GCN-DAG: s_sub_i32 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_sub_i32 +; GCN-DAG: s_lshr_b32 ; GCN: v_add_i32_e32 define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { %icmp0 = icmp ugt i8 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index 915f63dbdd477d..0d2f90793fc39a 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -28,7 +28,7 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:vreg_512 = COPY %0 ; CHECK: bb.1: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) ; CHECK: dead %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec ; CHECK: dead %8:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec ; CHECK: dead %9:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec @@ -52,7 +52,7 @@ body: | %4:vreg_512 = COPY %0 bb.1: - BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) + BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec %8:vreg_64 = DS_READ_B64_gfx9 %1, 0, 0, implicit $exec %9:vreg_128 = DS_READ_B128_gfx9 %2, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir index dbe03109b81953..50406a55c1828a 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -1,3 +1,5 @@ +# Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +# Notified per clause 4(b) of the license. 
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck %s @@ -25,34 +27,34 @@ body: | ; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec - ; CHECK: undef %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec - ; CHECK: %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec + ; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec ; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec + ; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec + ; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec + ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec ; CHECK: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec - ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec ; CHECK: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec - ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF2]], implicit $exec - ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, 0, 0, implicit $exec - ; CHECK: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]] - ; CHECK: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec + ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF1]], implicit $exec + ; CHECK: dead 
%22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, implicit $exec + ; CHECK: S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]] + ; CHECK: GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: S_SETREG_IMM32_B32 0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index eee471cb073bf1..b954b778dc6531 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -279,10 +279,10 @@ body: | %80:vgpr_32 = IMPLICIT_DEF %81:vgpr_32 = IMPLICIT_DEF %84:vgpr_32 = IMPLICIT_DEF - BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, 0, implicit $exec %85:vgpr_32 = IMPLICIT_DEF %86:vgpr_32 = IMPLICIT_DEF %87:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir index a72a406ff094f8..e52b955ffaf711 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -30,14 +30,14 @@ body: | %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec - %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, implicit $exec + %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, 0, implicit $exec %39:vgpr_32 = V_MUL_LO_U32 %15, %15, implicit $exec undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec - %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, implicit $exec + %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, 0, implicit $exec %40:vgpr_32 = V_MUL_LO_U32 %19, %19, implicit $exec %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir index b46bee82210b42..1d567a1a11b60b 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir @@ -1,3 +1,5 @@ +# Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +# Notified per clause 4(b) of the license. 
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=machine-scheduler -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s @@ -19,10 +21,10 @@ body: | ; GCN-LABEL: name: handleMove_bundle ; GCN: liveins: $sgpr4_sgpr5 ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN: $vcc_hi = IMPLICIT_DEF - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vcc_hi = IMPLICIT_DEF ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store 4, addrspace 3) ; GCN: $m0 = S_MOV_B32 0 diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll index 3dc547dd63c87f..76e8d85fd65107 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s @@ -181,8 +183,8 @@ define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 { } ; GCN-LABEL: {{^}}add_select_negliteralk_fabs_f32: -; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xc4800000 ; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0 @@ -367,9 +369,9 @@ define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 { } ; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32: -; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] @@ -385,9 +387,9 @@ define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 { } ; GCN-LABEL: {{^}}add_select_fneg_neginv2pi_f32: -; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: buffer_load_dword [[Y:v[0-9]+]] -; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 ; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 163f919b8ae801..6fffd8b7ea567b 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1,3 +1,5 @@ +; Modifications 
Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI @@ -31,13 +33,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrs ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_64: @@ -48,13 +50,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -166,13 +168,13 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrs ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 64, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_64_sub_x: @@ -183,13 +185,13 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_sub_u32_e32 v2, 64, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -226,13 +228,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrs ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffbf, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_65: @@ -243,13 +245,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffbf, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffbf, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -286,13 +288,13 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrs ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_65_sub_x: @@ -303,13 +305,13 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, 0x41, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: 
v_sub_u32_e32 v0, 0x41, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -346,13 +348,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 ad ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_neg16: @@ -363,13 +365,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 ad ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -406,13 +408,13 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 ad ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: @@ -423,13 +425,13 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 ad ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, -16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, -16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 
@llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -466,13 +468,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 ad ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 17, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_neg17: @@ -483,13 +485,13 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 ad ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 17, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 17, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -526,13 +528,13 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 ad ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xffffffef, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: @@ -543,13 +545,13 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 ad ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, 0xffffffef, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, 0xffffffef, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -621,13 +623,13 @@ define 
amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrs ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 +; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64: @@ -638,13 +640,13 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrs ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 +; GFX9-NEXT: global_store_short v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -756,20 +758,20 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 64 +; VI-NEXT: v_mov_b32_e32 v4, 64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffffc0, v0 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: @@ -780,13 +782,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 64 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -823,38 +825,38 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 64 +; VI-NEXT: v_mov_b32_e32 v4, 64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, -7, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, -7, v0 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x400007 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s0, 0x400007 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -891,38 +893,38 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffff85 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, 
v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffffc0, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x7b0040 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s0, 0x7b0040 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -963,15 +965,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out, ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, -7, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, -7, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: @@ -982,13 +984,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out, ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 7 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 7 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1022,19 +1024,19 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, -16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: @@ -1045,13 +1047,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out, ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1084,19 +1086,19 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0: @@ -1107,13 +1109,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out ; GFX9-NEXT: 
v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, -4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1146,19 +1148,19 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0xffffbc00 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0: @@ -1169,13 +1171,13 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* % ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1213,20 +1215,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 +; VI-NEXT: v_mov_b32_e32 v4, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, 
v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffe0, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffffe0, v0 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: @@ -1237,13 +1239,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1276,19 +1278,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 32 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: @@ -1299,13 +1301,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %o ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1345,15 +1347,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, 0xffffffe0, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: @@ -1364,13 +1366,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1408,20 +1410,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 +; VI-NEXT: v_mov_b32_e32 v4, -16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, -16, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 
-16, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: @@ -1432,13 +1434,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1471,19 +1473,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, -16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: @@ -1494,13 +1496,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %o ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1540,15 +1542,15 @@ define amdgpu_kernel 
void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, -16, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, -16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: @@ -1559,13 +1561,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1602,20 +1604,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffc400 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffc400, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffc400, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone: @@ -1626,13 +1628,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, 
vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1669,20 +1671,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x4400 +; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 4.0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 4.0, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone: @@ -1693,13 +1695,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, -1.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1736,20 +1738,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 2.0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 2.0, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: @@ -1760,13 +1762,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, -2.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1803,20 +1805,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffc000 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffc000, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffc000, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: @@ -1827,13 +1829,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 
v0, v0, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1867,18 +1869,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 32 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: @@ -1889,13 +1891,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1933,13 +1935,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v2, 32, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef: @@ -1950,13 +1952,13 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, 
v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir index 2aaf7f10b69a7b..42c42a48a6169d 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -81,11 +81,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_ADD_I32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -165,11 +165,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_SUB_I32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -249,11 +249,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_SUBREV_I32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -332,12 +332,12 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %9 = S_MOV_B64 0 %29, $vcc = V_ADDC_U32_e64 %19, %17, %9, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -417,12 +417,12 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec $vcc = S_MOV_B64 0 %29, $vcc = V_ADDC_U32_e64 %19, %17, $vcc, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -502,11 +502,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, $vcc = V_ADDC_U32_e64 %19, %17, undef $vcc, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index e9a6ba98942616..8460d2832a3a12 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s @@ -267,9 +269,9 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 +; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 +; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index 70c1ad96234458..4f6b2c973c5fc3 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI ; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI @@ -294,9 +296,9 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -309,9 +311,9 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 184082dd36250d..635da5cc8572ee 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI,CIVI,GCN %s @@ -32,12 +34,12 @@ define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> % ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] -; VI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, -; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; VI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, +; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_and_b32 diff --git a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll index 823785dc752dc1..65100f627f1eb5 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -1,10 +1,12 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOVCCZ-BUG %s ; GCN-FUNC: {{^}}vccz_workaround: -; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0 -; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0{{$}} +; GCN: s_load_dword [[REG:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], +; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, [[REG]], 0{{$}} ; VCCZ-BUG: s_waitcnt lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index f5857a330fab13..93a9df17e397c6 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,HAWAII %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FIJI %s @@ -8,10 +10,10 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI: ; %bb.0: ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 ; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; @@ -103,9 +105,9 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: ds_write_b16 v0, v2 offset:4 ; HAWAII-NEXT: ds_write_b32 v0, v1 ; HAWAII-NEXT: s_endpgm ; @@ -117,9 +119,9 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: ds_write_b16 v0, v2 offset:4 ; FIJI-NEXT: ds_write_b32 v0, v1 ; FIJI-NEXT: s_endpgm ; @@ -130,10 +132,10 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v2 ; GFX9-NEXT: s_endpgm store i48 %arg, i48 addrspace(3)* %ptr, align 8 ret void @@ -148,11 +150,11 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: s_and_b32 s0, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s0 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 ; HAWAII-NEXT: ds_write_b64 v2, v[0:1] ; HAWAII-NEXT: s_endpgm ; @@ -164,11 +166,11 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: s_and_b32 s0, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s0 -; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 ; FIJI-NEXT: ds_write_b64 v2, v[0:1] ; FIJI-NEXT: s_endpgm ; @@ -180,9 +182,9 @@ define amdgpu_kernel void @local_store_i65(i65 
addrspace(3)* %ptr, i65 %arg) #0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm @@ -215,10 +217,10 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; CIVI-NEXT: s_mov_b32 m0, -1 ; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 +; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index c7a08a6651156f..0029194d4afc67 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI @@ -7,7 +9,7 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -15,8 +17,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -30,7 +32,7 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -38,8 +40,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] @@ -166,42 +168,42 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* 
%out define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 0x1c8007b -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -216,42 +218,42 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 0xfc21fcb3 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0x3df -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -265,41 +267,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 1 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 1, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -313,40 +315,40 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -361,41 +363,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 1.0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v2, 0xffffc080 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -411,7 +413,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -419,8 +421,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -436,7 +438,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -444,8 +446,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] @@ -473,7 +475,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> 
addrspace(1) ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -481,14 +483,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v[4:5], off ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 @@ -500,26 +502,26 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v4, v[4:5] -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v0, v2, v4 -; VI-NEXT: v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v2 +; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -539,7 +541,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -547,8 +549,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 
v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -564,7 +566,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -572,8 +574,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] @@ -603,7 +605,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -611,8 +613,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off @@ -631,7 +633,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -639,8 +641,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: flat_load_dword v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index ff3e837235f053..1d6370c9e5c0e0 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI ; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI @@ -106,13 +108,13 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(<2 x i16> addrspace( ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[4:5], 0x0 -; VI-NEXT: s_load_dword s3, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s1, s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s2 -; VI-NEXT: s_sext_i32_i16 s1, s3 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_sext_i32_i16 s1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 41444b0ef0cde6..330426fb0ad13b 100644 --- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -88,7 +88,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.3 @@ -96,7 +96,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec bb.3.done: @@ -104,7 +104,7 @@ body: | $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... 
@@ -149,7 +149,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.3 @@ -157,7 +157,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec bb.3.done: @@ -165,7 +165,7 @@ body: | $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 145a18177a1e1c..af63a074eb3985 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s @@ -116,9 +118,9 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -134,11 +136,11 @@ define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -424,15 +426,15 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -446,11 +448,11 @@ define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v3, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -464,16 +466,16 @@ define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -807,10 +809,10 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir index 9d45c5b19e656d..10ed241acb585f 100644 --- a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir @@ -11,7 +11,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_smem_write_sgpr @@ -25,7 +25,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 ... 
# GCN-LABEL: name: vmem_snop_write_sgpr @@ -40,7 +40,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_NOP 0 $sgpr0 = S_MOV_B32 0 ... @@ -55,7 +55,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $exec $sgpr0 = S_MOV_B32 0 ... @@ -70,7 +70,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_WAITCNT 0 $sgpr0 = S_MOV_B32 0 ... @@ -86,7 +86,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_WAITCNT 1 $sgpr0 = S_MOV_B32 0 ... @@ -101,7 +101,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $exec = S_MOV_B64 7 ... # GCN-LABEL: name: vmem_write_exec_expread @@ -114,7 +114,7 @@ body: | bb.0: $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, 0, implicit $exec $exec = S_MOV_B64 7 ... 
# GCN-LABEL: name: ds_write_m0 @@ -143,7 +143,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec bb.1: $sgpr0 = S_MOV_B32 0 @@ -161,7 +161,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: @@ -181,7 +181,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.2 bb.1: @@ -206,7 +206,7 @@ body: | $sgpr0 = S_MOV_B32 0 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.0 ... # GCN-LABEL: name: ds_write_exec diff --git a/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir index 644651ded335c1..5dbe5d58d9bc35 100644 --- a/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir @@ -19,7 +19,7 @@ body: | $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_branch_to_next # GCN: bb.1: @@ -40,7 +40,7 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_too_far # GCN: bb.1: @@ -61,7 +61,7 @@ body: | $sgpr0 = S_MOV_B32 0 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_nops # GCN: bb.1: @@ -78,7 +78,7 @@ body: | S_NOP 4 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_branch_around # GCN: bb.2: @@ -107,7 +107,7 @@ body: | S_NOP 0 bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: name: vmem_vcc_branch_backedge # GCN: S_NOP @@ -123,7 +123,7 @@ body: | $vgpr0 = IMPLICIT_DEF $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec bb.1: $vgpr0 = IMPLICIT_DEF @@ -156,7 +156,7 @@ body: | $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_self_loop # GCN: S_NOP @@ -172,7 +172,7 @@ body: | $vgpr0 = IMPLICIT_DEF $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.0 ... @@ -198,7 +198,7 @@ body: | successors: %bb.1 $sgpr0 = S_MOV_B32 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.1 ... @@ -224,7 +224,7 @@ body: | successors: %bb.1 $sgpr0 = S_MOV_B32 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.1 ... diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll index 1f0a66f9165613..e3304d35d39227 100644 --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT ; RUN: llc -march=amdgcn --misched=ilpmax -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX @@ -13,7 +15,7 @@ ; DEFAULT: buffer_load_format_xyzw ; DEFAULT: s_waitcnt vmcnt(0) ; DEFAULT: exp -; DEFAULT-NEXT: exp +; DEFAULT: exp ; DEFAULT-NEXT: s_endpgm define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir index d6e983ae5904c9..bfd92347d92b6e 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir @@ -78,7 +78,7 @@ body: | bb.1: successors: %bb.2 - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2: successors: %bb.3, %bb.6 @@ -86,7 +86,7 @@ body: | bb.3: successors: %bb.4, %bb.5 - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_VCCNZ %bb.5, implicit $vcc bb.4: diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 85c79144d0e52d..fbe52f766aa84f 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s @@ -516,14 +518,14 @@ end: } ; GCN-LABEL: {{^}}test_preserve_condition_undef_flag: -; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 -; GFX1032: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0 +; GFX1032-DAG: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 +; GFX1032-DAG: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0 ; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 ; GFX1032: s_or_b32 [[OR1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} ; GFX1032: s_or_b32 [[OR2:s[0-9]+]], [[OR1]], s{{[0-9]+}} ; GFX1032: s_and_b32 vcc_lo, exec_lo, [[OR2]] -; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 -; GFX1064: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0 +; GFX1064-DAG: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 +; GFX1064-DAG: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0 ; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 ; GFX1064: s_or_b64 [[OR1:s\[[0-9:]+\]]], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GFX1064: s_or_b64 [[OR2:s\[[0-9:]+\]]], [[OR1]], s[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 4beebc0b34a268..2e1ee43fea0d19 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s @@ -272,17 +274,17 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4) ; VI-LABEL: no_widen_i16_constant_divergent_load: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v2 -; VI-NEXT: v_or_b32_e32 v2, 4, v2 +; VI-NEXT: v_add_u16_e32 v0, 0x3e7, v0 +; VI-NEXT: v_or_b32_e32 v2, 4, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll index e29e67f77d2e7c..36df31df2926a4 100644 --- a/llvm/test/CodeGen/AMDGPU/xor3.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s @@ -155,12 +157,12 @@ define amdgpu_ps float @xor3_uniform_vgpr(float inreg %a, float inreg %b, float ; ; GFX10-LABEL: xor3_uniform_vgpr: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0 ; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0 -; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0 -; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4 +; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-NEXT: ; return to shader part epilog %a1 = fadd float %a, 1.0 %b2 = fadd float %b, 2.0 diff --git a/llvm/test/CodeGen/ARM/r7-fixed-darwin.ll b/llvm/test/CodeGen/ARM/r7-fixed-darwin.ll new file mode 100644 index 00000000000000..dc59b6acb42141 --- /dev/null +++ b/llvm/test/CodeGen/ARM/r7-fixed-darwin.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=thumbv7k-apple-watchos %s -o - | FileCheck %s + +; r7 is FP on Darwin, and should be preserved even if we don't create a new +; frame record for this leaf function. So make huge register pressure to try & +; tempt LLVM to use it. 
+define void @foo([16 x i32]* %ptr) { +; CHECK-LABEL: foo: +; CHECK: push.w +; CHECK: .cfi_offset r7 +; CHECK-NOT: r7 +; CHECK: pop.w + %val = load volatile [16 x i32], [16 x i32]* %ptr + store volatile [16 x i32] %val, [16 x i32]* %ptr + ret void +} diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll new file mode 100644 index 00000000000000..0f75cd81242151 --- /dev/null +++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll @@ -0,0 +1,127 @@ +; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -march=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; Source code: +; struct t1 { +; int c; +; }; +; struct s1 { +; struct t1 b; +; }; +; struct r1 { +; struct s1 a; +; }; +; #define _(x) __builtin_preserve_access_index(x) +; void test1(void *p1, void *p2, void *p3); +; void test(struct r1 *arg) { +; struct s1 *ps = _(&arg->a); +; struct t1 *pt = _(&arg->a.b); +; int *pi = _(&arg->a.b.c); +; test1(ps, pt, pi); +; } +; Compilation flag: +; clang -target bpf -O2 -g -S -emit-llvm test.c + +%struct.r1 = type { %struct.s1 } +%struct.s1 = type { %struct.t1 } +%struct.t1 = type { i32 } + +; Function Attrs: nounwind +define dso_local void @test(%struct.r1* %arg) local_unnamed_addr #0 !dbg !7 { +entry: + call void @llvm.dbg.value(metadata %struct.r1* %arg, metadata !22, metadata !DIExpression()), !dbg !29 + %0 = tail call %struct.s1* @llvm.preserve.struct.access.index.p0s_struct.s1s.p0s_struct.r1s(%struct.r1* %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !11 + call void @llvm.dbg.value(metadata %struct.s1* %0, metadata !23, metadata !DIExpression()), !dbg !29 + %1 = tail call %struct.t1* @llvm.preserve.struct.access.index.p0s_struct.t1s.p0s_struct.s1s(%struct.s1* %0, i32 0, i32 0), !dbg !31, !llvm.preserve.access.index !14 + call void @llvm.dbg.value(metadata %struct.t1* %1, metadata !25, metadata !DIExpression()), !dbg !29 + %2 = tail call i32* @llvm.preserve.struct.access.index.p0i32.p0s_struct.t1s(%struct.t1* %1, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !17 + call void @llvm.dbg.value(metadata i32* %2, metadata !27, metadata !DIExpression()), !dbg !29 + %3 = bitcast %struct.s1* %0 to i8*, !dbg !33 + %4 = bitcast %struct.t1* %1 to i8*, !dbg !34 + %5 = bitcast i32* %2 to i8*, !dbg !35 + tail call void @test1(i8* %3, i8* %4, i8* %5) #4, !dbg !36 + ret void, !dbg !37 +} + +; CHECK: .long 1 # BTF_KIND_STRUCT(id = 2) + +; CHECK: .ascii "r1" # string offset=1 +; CHECK: .ascii ".text" # string offset=29 +; CHECK: .ascii "0:0" # string offset=72 +; CHECK: .ascii "0:0:0" # string offset=76 +; CHECK: .ascii "0:0:0:0" # string offset=82 + +; CHECK: .long 12 # OffsetReloc +; CHECK-NEXT: .long 29 # Offset reloc section string offset=29 +; CHECK-NEXT: .long 3 +; CHECK_NEXT: .long .Ltmp{{[0-9]+}} +; CHECK_NEXT: .long 2 +; CHECK_NEXT: .long 72 +; CHECK_NEXT: .long .Ltmp{{[0-9]+}} +; CHECK_NEXT: .long 2 +; CHECK_NEXT: .long 76 +; CHECK_NEXT: .long .Ltmp{{[0-9]+}} +; CHECK_NEXT: .long 2 +; CHECK_NEXT: .long 82 + +; Function Attrs: nounwind readnone +declare %struct.s1* @llvm.preserve.struct.access.index.p0s_struct.s1s.p0s_struct.r1s(%struct.r1*, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare %struct.t1* @llvm.preserve.struct.access.index.p0s_struct.t1s.p0s_struct.s1s(%struct.s1*, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32* @llvm.preserve.struct.access.index.p0i32.p0s_struct.t1s(%struct.t1*, i32, i32) #1 + +declare 
dso_local void @test1(i8*, i8*, i8*) local_unnamed_addr #2 + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) #3 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind readnone speculatable willreturn } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0 (https://github.com/llvm/llvm-project.git 42b3328a2368b38fba6bdb0c616fe6c5520e3bc5)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp/home/yhs/work/tests/core") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 42b3328a2368b38fba6bdb0c616fe6c5520e3bc5)"} +!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 12, type: !8, scopeLine: 12, flags: DIFlagPrototyped, isDefinition: true, isOptimized: true, unit: !0, retainedNodes: !21) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "r1", file: !1, line: 7, size: 32, elements: !12) +!12 = !{!13} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !11, file: !1, line: 8, baseType: !14, size: 32) +!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s1", file: !1, line: 4, size: 32, elements: !15) +!15 = !{!16} +!16 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !14, file: !1, line: 5, baseType: !17, size: 32) +!17 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", file: !1, line: 1, size: 32, elements: !18) +!18 = !{!19} +!19 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !17, file: !1, line: 2, baseType: !20, size: 32) +!20 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!21 = !{!22, !23, !25, !27} +!22 = !DILocalVariable(name: "arg", arg: 1, scope: !7, file: !1, line: 12, type: !10) +!23 = !DILocalVariable(name: "ps", scope: !7, file: !1, line: 13, type: !24) +!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64) +!25 = !DILocalVariable(name: "pt", scope: !7, file: !1, line: 14, type: !26) +!26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 64) +!27 = !DILocalVariable(name: "pi", scope: !7, file: !1, line: 15, type: !28) +!28 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !20, size: 64) +!29 = !DILocation(line: 0, scope: !7) +!30 = !DILocation(line: 13, column: 19, scope: !7) +!31 = !DILocation(line: 
14, column: 19, scope: !7) +!32 = !DILocation(line: 15, column: 13, scope: !7) +!33 = !DILocation(line: 16, column: 9, scope: !7) +!34 = !DILocation(line: 16, column: 13, scope: !7) +!35 = !DILocation(line: 16, column: 17, scope: !7) +!36 = !DILocation(line: 16, column: 3, scope: !7) +!37 = !DILocation(line: 17, column: 1, scope: !7) diff --git a/llvm/test/CodeGen/Hexagon/pipeliner/swp-phi-start.mir b/llvm/test/CodeGen/Hexagon/pipeliner/swp-phi-start.mir index 5b953f13b1b48e..aa553dccc7de74 100644 --- a/llvm/test/CodeGen/Hexagon/pipeliner/swp-phi-start.mir +++ b/llvm/test/CodeGen/Hexagon/pipeliner/swp-phi-start.mir @@ -1,4 +1,4 @@ -# RUN: llc < %s -x mir -march=hexagon -run-pass=modulo-schedule-test | FileCheck %s +# RUN: llc < %s -x mir -march=hexagon -run-pass=modulo-schedule-test -pipeliner-experimental-cg=true | FileCheck %s # Simple check for this sanity test; ensure all instructions are in stage 0 in # the prolog and stage 3 in the epilog. diff --git a/llvm/test/CodeGen/Hexagon/swp-art-deps-rec.ll b/llvm/test/CodeGen/Hexagon/swp-art-deps-rec.ll index 5272faf8f9b8ab..f89df6e55734d3 100644 --- a/llvm/test/CodeGen/Hexagon/swp-art-deps-rec.ll +++ b/llvm/test/CodeGen/Hexagon/swp-art-deps-rec.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts ; RUN: llc -march=hexagon -mcpu=hexagonv65 -O3 -debug-only=pipeliner \ -; RUN: < %s 2>&1 | FileCheck %s +; RUN: < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s ; Test that the artificial dependences are ignored while computing the ; circuits. diff --git a/llvm/test/CodeGen/Hexagon/swp-bad-sched.ll b/llvm/test/CodeGen/Hexagon/swp-bad-sched.ll index 74f0647a72a4ed..ee93e8b3468e2d 100644 --- a/llvm/test/CodeGen/Hexagon/swp-bad-sched.ll +++ b/llvm/test/CodeGen/Hexagon/swp-bad-sched.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -march=hexagon -enable-pipeliner -enable-aa-sched-mi < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -enable-aa-sched-mi < %s -pipeliner-experimental-cg=true | FileCheck %s ; CHECK: loop0( ; CHECK: loop0(.LBB0_[[LOOP:.]], diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll index b33cf522115b23..e5b5be4d430048 100644 --- a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 < %s | FileCheck %s +; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct code when a loop carried value ; is scheduled one stage earlier than its use. The code in diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir b/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir index 8271e8b1b54eee..d9db8ec6194c8a 100644 --- a/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir +++ b/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s +# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s # REQUIRES: asserts # Test that the loop carried dependence check correctly identifies a recurrence.
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir b/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir index 126e6aa462b4e1..5271a2db7758e7 100644 --- a/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir +++ b/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s +# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s # REQUIRES: asserts # Test that the loop carried dependence check correctly identifies a recurrence diff --git a/llvm/test/CodeGen/Hexagon/swp-chain-refs.ll b/llvm/test/CodeGen/Hexagon/swp-chain-refs.ll index d0e72be778a1c9..5695f3d61b7401 100644 --- a/llvm/test/CodeGen/Hexagon/swp-chain-refs.ll +++ b/llvm/test/CodeGen/Hexagon/swp-chain-refs.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=hexagon -enable-pipeliner=true -stats -o /dev/null < %s \ -; RUN: 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; Test that we do not schedule chained references too far apart, diff --git a/llvm/test/CodeGen/Hexagon/swp-change-dep1.ll b/llvm/test/CodeGen/Hexagon/swp-change-dep1.ll index 855f43e50125df..157bdd069f921c 100644 --- a/llvm/test/CodeGen/Hexagon/swp-change-dep1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-change-dep1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=1 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=1 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we update the offset correctly for loads that are ; moved past stores. In these cases, we change the dependences diff --git a/llvm/test/CodeGen/Hexagon/swp-change-deps.ll b/llvm/test/CodeGen/Hexagon/swp-change-deps.ll index e2ca071f5f5fb8..1b35c633c52de4 100644 --- a/llvm/test/CodeGen/Hexagon/swp-change-deps.ll +++ b/llvm/test/CodeGen/Hexagon/swp-change-deps.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct offsets for loads in the prolog ; after removing dependences on a post-increment instructions of the diff --git a/llvm/test/CodeGen/Hexagon/swp-check-offset.ll b/llvm/test/CodeGen/Hexagon/swp-check-offset.ll index 220ebde0f86d6f..6a7211df12d8f3 100644 --- a/llvm/test/CodeGen/Hexagon/swp-check-offset.ll +++ b/llvm/test/CodeGen/Hexagon/swp-check-offset.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv62 -enable-pipeliner < %s | FileCheck --check-prefix=CHECK-V62 %s -; RUN: llc -march=hexagon -mcpu=hexagonv65 -enable-pipeliner < %s | FileCheck --check-prefix=CHECK-V65 %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv62 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck --check-prefix=CHECK-V62 %s +; RUN: llc -march=hexagon -mcpu=hexagonv65 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck --check-prefix=CHECK-V65 %s ; ; Make sure we pipeline the loop and that we generate the correct diff --git a/llvm/test/CodeGen/Hexagon/swp-const-tc1.ll b/llvm/test/CodeGen/Hexagon/swp-const-tc1.ll index 95dfc37e306206..c785ee74513a91 100644 --- 
a/llvm/test/CodeGen/Hexagon/swp-const-tc1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-const-tc1.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=hexagon -enable-pipeliner -enable-pipeliner-opt-size \ ; RUN: -verify-machineinstrs -hexagon-initial-cfg-cleanup=0 \ ; RUN: -enable-aa-sched-mi=false -hexagon-expand-condsets=0 \ -; RUN: < %s | FileCheck %s +; RUN: < %s -pipeliner-experimental-cg=true | FileCheck %s ; Disable expand-condsets because it will assert on undefined registers. diff --git a/llvm/test/CodeGen/Hexagon/swp-const-tc2.ll b/llvm/test/CodeGen/Hexagon/swp-const-tc2.ll index 8b9f87f428d65d..29d12bd14390dc 100644 --- a/llvm/test/CodeGen/Hexagon/swp-const-tc2.ll +++ b/llvm/test/CodeGen/Hexagon/swp-const-tc2.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -rdf-opt=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -rdf-opt=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we fixup a pipelined loop correctly when the number of ; stages is greater than the compile-time loop trip count. In this diff --git a/llvm/test/CodeGen/Hexagon/swp-const-tc3.ll b/llvm/test/CodeGen/Hexagon/swp-const-tc3.ll index a8caebd09eb11c..48a61428538c23 100644 --- a/llvm/test/CodeGen/Hexagon/swp-const-tc3.ll +++ b/llvm/test/CodeGen/Hexagon/swp-const-tc3.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the pipeliner correctly fixes up the pipelined CFG when the loop ; has a constant trip count, and the trip count is less than the number of diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll index 48f33bd6d22ccc..d14177cc684f25 100644 --- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll +++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; XFAIL: * ; LSR changes required. diff --git a/llvm/test/CodeGen/Hexagon/swp-copytophi-dag.ll b/llvm/test/CodeGen/Hexagon/swp-copytophi-dag.ll index 69743407c148ca..f511241a7c73b6 100644 --- a/llvm/test/CodeGen/Hexagon/swp-copytophi-dag.ll +++ b/llvm/test/CodeGen/Hexagon/swp-copytophi-dag.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts ; ; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \ -; RUN: 2>&1 | FileCheck %s +; RUN: 2>&1 -pipeliner-experimental-cg=true | FileCheck %s ; Test that the artificial dependence is created as a result of ; CopyToPhi DAG mutation. 
diff --git a/llvm/test/CodeGen/Hexagon/swp-dep-neg-offset.ll b/llvm/test/CodeGen/Hexagon/swp-dep-neg-offset.ll index 7ba4286bf41441..cc19ce1ae44178 100644 --- a/llvm/test/CodeGen/Hexagon/swp-dep-neg-offset.ll +++ b/llvm/test/CodeGen/Hexagon/swp-dep-neg-offset.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the code that changes the dependences does not allow ; a load with a negative offset to be overlapped with the post diff --git a/llvm/test/CodeGen/Hexagon/swp-disable-Os.ll b/llvm/test/CodeGen/Hexagon/swp-disable-Os.ll index cbdc3ba36f6500..5698d37cb23ba4 100644 --- a/llvm/test/CodeGen/Hexagon/swp-disable-Os.ll +++ b/llvm/test/CodeGen/Hexagon/swp-disable-Os.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; CHECK: loop0(.LBB0_{{[0-9]+}},#347) target triple = "hexagon" diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-numphis.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-numphis.ll index a54ac5825601db..f57f94bf03cecd 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-numphis.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-numphis.ll @@ -1,6 +1,6 @@ ; XFAIL: * ; Needs some fixed in the pipeliner. -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; CHECK: endloop0 ; CHECK: vmem diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi2.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi2.ll index b2a7dada33f41a..b32fed97f26f19 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi2.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi2.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=3 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=3 < %s -pipeliner-experimental-cg=true | FileCheck %s %s.0 = type { i16, i8, i8, i16, i8, i8, i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i32, i16, i8, i8, %s.1, [2 x [16 x %s.2]], i32 (i8*, i8*, i8*, i8*, i8*)*, %s.3*, %s.3*, [120 x i8], i8, i8, %s.4*, [2 x [120 x [8 x i8]]], [56 x i8], [2 x [121 x %s.5]], [2 x %s.5], %s.5*, %s.5*, i32, i32, i16, i8, i8, %s.7, %s.9, %s.11, %s.8*, %s.8* } %s.1 = type { i8, i8, i8, i8, i8, i8, i8, i8, i32, i8, [16 x i8], i8, [4 x i8], [32 x i16], [32 x i16], [2 x i8], [4 x i8], [2 x [4 x i8]], [2 x [4 x i8]], i32, i32, i16, i8 } diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi4.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi4.ll index e85ea7654e0559..8b611cfe0b4f2f 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi4.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi4.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct value for a Phi in the epilog ; that is for a value defined two stages earlier. 
An extra copy in the diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi5.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi5.ll index a524dc0d5bedf1..72c05284d6953f 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi5.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi5.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we use the correct name in an epilog phi for a phi value ; that is defined for the last time in the kernel. Previously, we diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi8.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi8.ll index 370d31d92c707f..214307e25137de 100644 --- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi8.ll +++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi8.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mno-pairing -mno-compound -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -mno-pairing -mno-compound -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; XFAIL: * ; Test that we generate the correct phi names in the epilog when the pipeliner diff --git a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll index 681e7492337858..c87479f6e97db2 100644 --- a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct names for the phis in the kernel for the ; incoming values. In this case, the loop contains a phi and has another phi diff --git a/llvm/test/CodeGen/Hexagon/swp-large-rec.ll b/llvm/test/CodeGen/Hexagon/swp-large-rec.ll index ee88aaffd5c01e..45d40df4ec0ae1 100644 --- a/llvm/test/CodeGen/Hexagon/swp-large-rec.ll +++ b/llvm/test/CodeGen/Hexagon/swp-large-rec.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=hexagon -enable-pipeliner -stats \ ; RUN: -pipeliner-prune-loop-carried=false -fp-contract=fast \ -; RUN: -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; Test that we do not pipeline this loop. The recurrence is too large. If diff --git a/llvm/test/CodeGen/Hexagon/swp-listen-loop3.ll b/llvm/test/CodeGen/Hexagon/swp-listen-loop3.ll index 9b68cf956591e3..d8e4f003d53758 100644 --- a/llvm/test/CodeGen/Hexagon/swp-listen-loop3.ll +++ b/llvm/test/CodeGen/Hexagon/swp-listen-loop3.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -pipeliner-ignore-recmii -pipeliner-max-stages=2 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -pipeliner-ignore-recmii -pipeliner-max-stages=2 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; This is a loop we pipeline to three packets, though we could do better.
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll b/llvm/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll index b95e62419567b2..9f145189786ef8 100644 --- a/llvm/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll +++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the pipeliner schedules a store before the load in which there is a ; loop carried dependence. Previously, the loop carried dependence wasn't added diff --git a/llvm/test/CodeGen/Hexagon/swp-lots-deps.ll b/llvm/test/CodeGen/Hexagon/swp-lots-deps.ll index a657f92c5d5a0a..631d02649d11e5 100644 --- a/llvm/test/CodeGen/Hexagon/swp-lots-deps.ll +++ b/llvm/test/CodeGen/Hexagon/swp-lots-deps.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; STATS: 1 pipeliner - Number of loops software pipelined diff --git a/llvm/test/CodeGen/Hexagon/swp-max.ll b/llvm/test/CodeGen/Hexagon/swp-max.ll index 26238ea6fb3744..32282204ec52ab 100644 --- a/llvm/test/CodeGen/Hexagon/swp-max.ll +++ b/llvm/test/CodeGen/Hexagon/swp-max.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \ -; RUN: -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s @A = global [8 x i32] [i32 4, i32 -3, i32 5, i32 -2, i32 -1, i32 2, i32 6, i32 -2], align 8 diff --git a/llvm/test/CodeGen/Hexagon/swp-maxstart.ll b/llvm/test/CodeGen/Hexagon/swp-maxstart.ll index 811c94062a0f04..8d65e76913f36f 100644 --- a/llvm/test/CodeGen/Hexagon/swp-maxstart.ll +++ b/llvm/test/CodeGen/Hexagon/swp-maxstart.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -O3 < %s | FileCheck %s +; RUN: llc -march=hexagon -O3 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the MinStart computation, which is based upon the length ; of the chain edges, is computed correctly. A bug in the code allowed diff --git a/llvm/test/CodeGen/Hexagon/swp-memrefs-epilog.ll b/llvm/test/CodeGen/Hexagon/swp-memrefs-epilog.ll index 81f4d22cfd5430..20e39dd08fd727 100644 --- a/llvm/test/CodeGen/Hexagon/swp-memrefs-epilog.ll +++ b/llvm/test/CodeGen/Hexagon/swp-memrefs-epilog.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -O2 -fp-contract=fast < %s | FileCheck %s +; RUN: llc -march=hexagon -O2 -fp-contract=fast < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the memoperands for instructions in the epilog are updated ; correctly. Previously, the pipeliner updated the offset for the memoperands diff --git a/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll b/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll index fc2576af8ac2c5..5a2e7d4e14d5b3 100644 --- a/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll +++ b/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; Make sure we attempt to pipeline all inner most loops. 
diff --git a/llvm/test/CodeGen/Hexagon/swp-new-phi.ll b/llvm/test/CodeGen/Hexagon/swp-new-phi.ll index 0ba3e30731abc3..d3c1058fe36cc3 100644 --- a/llvm/test/CodeGen/Hexagon/swp-new-phi.ll +++ b/llvm/test/CodeGen/Hexagon/swp-new-phi.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the generatePhi code doesn't rename a Phi instruction that's defined ; in the same block. The bug causes a Phi to incorrectly depend on another Phi. diff --git a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll index 5de0717654ffa6..0a017c4ab7f6f7 100644 --- a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll +++ b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the instruction ordering code in the pipeliner fixes up dependences ; between post-increment register definitions and uses so that the register diff --git a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll index d1d852bcae778a..4cd29a4a0baf88 100644 --- a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll +++ b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the pipeliner doesn't cause an assert and correctly pipelines the ; loop. diff --git a/llvm/test/CodeGen/Hexagon/swp-order.ll b/llvm/test/CodeGen/Hexagon/swp-order.ll index bc16b8835b7d8a..14cc682eb7e57f 100644 --- a/llvm/test/CodeGen/Hexagon/swp-order.ll +++ b/llvm/test/CodeGen/Hexagon/swp-order.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that when we order instructions in a packet we check for ; order dependences so that the source of an order dependence diff --git a/llvm/test/CodeGen/Hexagon/swp-phi-ch-offset.ll b/llvm/test/CodeGen/Hexagon/swp-phi-ch-offset.ll index 68cb69ba2ac4db..31b98328a2facb 100644 --- a/llvm/test/CodeGen/Hexagon/swp-phi-ch-offset.ll +++ b/llvm/test/CodeGen/Hexagon/swp-phi-ch-offset.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct offsets after we removed unneeded ; chain dependences between Phis and generated a better pipeline. diff --git a/llvm/test/CodeGen/Hexagon/swp-phi-chains.ll b/llvm/test/CodeGen/Hexagon/swp-phi-chains.ll index e3a6c9db616ec6..3037dcc2d5e177 100644 --- a/llvm/test/CodeGen/Hexagon/swp-phi-chains.ll +++ b/llvm/test/CodeGen/Hexagon/swp-phi-chains.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -debug-only=pipeliner < %s -o - 2>&1 | FileCheck %s +; RUN: llc -march=hexagon -debug-only=pipeliner < %s -o - 2>&1 -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; Test that there is a chain edge between two dependent Phis.
diff --git a/llvm/test/CodeGen/Hexagon/swp-phi-dep.ll b/llvm/test/CodeGen/Hexagon/swp-phi-dep.ll index 38b56c1126a009..ec7af41a31fa7c 100644 --- a/llvm/test/CodeGen/Hexagon/swp-phi-dep.ll +++ b/llvm/test/CodeGen/Hexagon/swp-phi-dep.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -enable-pipeliner -pipeliner-max-stages=2 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -enable-pipeliner -pipeliner-max-stages=2 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Check that the pipelined code uses the proper address in the ; prolog and the kernel. The bug occurs when the address computation diff --git a/llvm/test/CodeGen/Hexagon/swp-phi-ref.ll b/llvm/test/CodeGen/Hexagon/swp-phi-ref.ll index d39252141d2eb5..be838e767aa071 100644 --- a/llvm/test/CodeGen/Hexagon/swp-phi-ref.ll +++ b/llvm/test/CodeGen/Hexagon/swp-phi-ref.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -enable-bsb-sched=0 -join-liveintervals=false < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -enable-bsb-sched=0 -join-liveintervals=false < %s -pipeliner-experimental-cg=true | FileCheck %s ; XFAIL: * ; This test is failing after post-ra machine sinking. diff --git a/llvm/test/CodeGen/Hexagon/swp-pragma-disable.ii b/llvm/test/CodeGen/Hexagon/swp-pragma-disable.ii index 80494f5ac10deb..b97065c129b0f4 100644 --- a/llvm/test/CodeGen/Hexagon/swp-pragma-disable.ii +++ b/llvm/test/CodeGen/Hexagon/swp-pragma-disable.ii @@ -1,5 +1,5 @@ ; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \ -; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s +; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; ; Test that checks if pipeliner disabled by pragma diff --git a/llvm/test/CodeGen/Hexagon/swp-pragma-initiation-interval.ii b/llvm/test/CodeGen/Hexagon/swp-pragma-initiation-interval.ii index 6a4ba7eaccd313..2c6b606a99f730 100644 --- a/llvm/test/CodeGen/Hexagon/swp-pragma-initiation-interval.ii +++ b/llvm/test/CodeGen/Hexagon/swp-pragma-initiation-interval.ii @@ -1,5 +1,5 @@ ; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \ -; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s +; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; ; Test that checks if the II set by pragma was taken by pipeliner. diff --git a/llvm/test/CodeGen/Hexagon/swp-prolog-phi.ll b/llvm/test/CodeGen/Hexagon/swp-prolog-phi.ll index 4a6fa5a6cfe738..14b04a1cfe6477 100644 --- a/llvm/test/CodeGen/Hexagon/swp-prolog-phi.ll +++ b/llvm/test/CodeGen/Hexagon/swp-prolog-phi.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -rdf-opt=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -rdf-opt=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we generate the correct name for a value in a prolog block. The ; pipeliner was using an incorrect value for an instruction in the 2nd prolog diff --git a/llvm/test/CodeGen/Hexagon/swp-rename.ll b/llvm/test/CodeGen/Hexagon/swp-rename.ll index ab0cc11ec7e9e2..eb60a0e38d0e7e 100644 --- a/llvm/test/CodeGen/Hexagon/swp-rename.ll +++ b/llvm/test/CodeGen/Hexagon/swp-rename.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; A test that the Phi rewrite logic is correct. 
diff --git a/llvm/test/CodeGen/Hexagon/swp-resmii-1.ll b/llvm/test/CodeGen/Hexagon/swp-resmii-1.ll index 1a9734860d5cb8..1e2eefae6ac84e 100644 --- a/llvm/test/CodeGen/Hexagon/swp-resmii-1.ll +++ b/llvm/test/CodeGen/Hexagon/swp-resmii-1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc -march=hexagon -enable-pipeliner -debug-only=pipeliner < %s -o - 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; Test that checks that we compute the correct ResMII for haar. diff --git a/llvm/test/CodeGen/Hexagon/swp-resmii.ll b/llvm/test/CodeGen/Hexagon/swp-resmii.ll index 851d82ea50f091..99812af3be5d0a 100644 --- a/llvm/test/CodeGen/Hexagon/swp-resmii.ll +++ b/llvm/test/CodeGen/Hexagon/swp-resmii.ll @@ -1,5 +1,5 @@ ; RUN: llc -disable-lsr -march=hexagon -enable-pipeliner \ -; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null | FileCheck %s +; RUN: -debug-only=pipeliner < %s 2>&1 > /dev/null -pipeliner-experimental-cg=true | FileCheck %s ; REQUIRES: asserts ; ; Test that checks if the ResMII is 1. diff --git a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll index 7371ed10a7117d..6883e51503b140 100644 --- a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll +++ b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that the pipeliner generates correct code when attempting to reuse ; an existing phi. This test case contains a phi that references another diff --git a/llvm/test/CodeGen/Hexagon/swp-sigma.ll b/llvm/test/CodeGen/Hexagon/swp-sigma.ll index 56742ca83a71bd..968fafc449d728 100644 --- a/llvm/test/CodeGen/Hexagon/swp-sigma.ll +++ b/llvm/test/CodeGen/Hexagon/swp-sigma.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -O2 < %s | FileCheck %s +; RUN: llc -march=hexagon -O2 < %s -pipeliner-experimental-cg=true | FileCheck %s ; We do not pipeline sigma yet, but the non-pipelined version ; with good scheduling is pretty fast. The compiler generates diff --git a/llvm/test/CodeGen/Hexagon/swp-stages4.ll b/llvm/test/CodeGen/Hexagon/swp-stages4.ll index 2d88094cf740b7..1b96aca2a48c54 100644 --- a/llvm/test/CodeGen/Hexagon/swp-stages4.ll +++ b/llvm/test/CodeGen/Hexagon/swp-stages4.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -disable-block-placement=0 -hexagon-bit=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -disable-block-placement=0 -hexagon-bit=0 < %s -pipeliner-experimental-cg=true | FileCheck %s ; Test that we rename registers correctly for multiple stages when there is a ; Phi and depends upon another Phi. 
; CHECK: = and ; CHECK: = and -; CHECK: = and +; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1) ; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255) ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255) ; CHECK: loop0(.LBB0_[[LOOP:.]], diff --git a/llvm/test/CodeGen/Hexagon/swp-subreg.ll b/llvm/test/CodeGen/Hexagon/swp-subreg.ll index d75b3afc7b72c3..b9754f4eb3628d 100644 --- a/llvm/test/CodeGen/Hexagon/swp-subreg.ll +++ b/llvm/test/CodeGen/Hexagon/swp-subreg.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; We're unable to pipeline a loop with a subreg as an operand of a Phi. diff --git a/llvm/test/CodeGen/Hexagon/swp-swap.ll b/llvm/test/CodeGen/Hexagon/swp-swap.ll index a8432cb7d21872..4cd073cb16b837 100644 --- a/llvm/test/CodeGen/Hexagon/swp-swap.ll +++ b/llvm/test/CodeGen/Hexagon/swp-swap.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: llc -march=hexagon -enable-pipeliner -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; Test that we don't pipeline, incorrectly, the swap operation. diff --git a/llvm/test/CodeGen/Hexagon/swp-tfri.ll b/llvm/test/CodeGen/Hexagon/swp-tfri.ll index 66b999e5590bd1..f0c26045430c24 100644 --- a/llvm/test/CodeGen/Hexagon/swp-tfri.ll +++ b/llvm/test/CodeGen/Hexagon/swp-tfri.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: llc -march=hexagon -enable-pipeliner -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null < %s 2>&1 -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; Check that we handle the case when a value is first defined in the loop. diff --git a/llvm/test/CodeGen/Hexagon/swp-vect-dotprod.ll b/llvm/test/CodeGen/Hexagon/swp-vect-dotprod.ll index 3ff88452499e09..4bd1a513429f01 100644 --- a/llvm/test/CodeGen/Hexagon/swp-vect-dotprod.ll +++ b/llvm/test/CodeGen/Hexagon/swp-vect-dotprod.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s -pipeliner-experimental-cg=true | FileCheck %s ; ; Check that we pipeline a vectorized dot product in a single packet. 
; diff --git a/llvm/test/CodeGen/Hexagon/swp-vmult.ll b/llvm/test/CodeGen/Hexagon/swp-vmult.ll index dfc7dd913242b2..fd9cdf9b38c90e 100644 --- a/llvm/test/CodeGen/Hexagon/swp-vmult.ll +++ b/llvm/test/CodeGen/Hexagon/swp-vmult.ll @@ -1,5 +1,5 @@ ; REQUIRES: to-be-fixed -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s ; Multiply and accumulate ; CHECK: mpyi([[REG0:r([0-9]+)]],[[REG1:r([0-9]+)]]) diff --git a/llvm/test/CodeGen/Hexagon/swp-vsum.ll b/llvm/test/CodeGen/Hexagon/swp-vsum.ll index 1c4d1c2ef01753..5dcd2824550d65 100644 --- a/llvm/test/CodeGen/Hexagon/swp-vsum.ll +++ b/llvm/test/CodeGen/Hexagon/swp-vsum.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s -; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s --check-prefix=CHECKV60 +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s -pipeliner-experimental-cg=true | FileCheck %s --check-prefix=CHECKV60 ; Simple vector total. ; CHECK: loop0(.LBB0_[[LOOP:.]], diff --git a/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir b/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir index a93af1d54d39dc..c92d35f9239155 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir @@ -44,6 +44,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir b/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir index 31efedd3796098..5a0e0309e417f6 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir @@ -44,6 +44,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir b/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir index e3bcac22f13671..8ad50b72c284fc 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir @@ -32,7 +32,7 @@ } ... 
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test1 liveins: @@ -56,14 +56,14 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) --- name: test2 liveins: @@ -87,14 +87,14 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) --- name: test3 liveins: @@ -118,13 +118,13 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... 
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test4 liveins: @@ -148,8 +148,8 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir b/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir index 005014d5e83f4b..67bf92b60814b6 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir @@ -7,7 +7,7 @@ # CHECK-NEXT: %namedVReg1353:vreg_64 = REG_SEQUENCE %namedVReg4354, %subreg.sub0, %namedVReg1352, %subreg.sub1 # CHECK-NEXT: %namedVReg1354:sgpr_128 = REG_SEQUENCE %namedVReg4354, %subreg.sub0, %namedVReg1352, %subreg.sub1, %namedVReg1358, %subreg.sub2, %namedVReg1359, %subreg.sub3 # This tests for the itereator invalidation fix (reviews.llvm.org/D62713) -# CHECK-NEXT: BUFFER_STORE_DWORD_ADDR64 %namedVReg1352, %namedVReg1353, %namedVReg1354, 0, 0, 0, 0, 0, 0, implicit $exec +# CHECK-NEXT: BUFFER_STORE_DWORD_ADDR64 %namedVReg1352, %namedVReg1353, %namedVReg1354, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... --- name: foo @@ -27,7 +27,7 @@ body: | %vreg123_3:vgpr_32 = COPY %5 %16:sgpr_128 = REG_SEQUENCE killed %vreg123_0, %subreg.sub0, %vreg123_1, %subreg.sub1, %vreg123_2, %subreg.sub2, %vreg123_3, %subreg.sub3 - BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir index 629f7aefd6aff9..c48b13e46e207e 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir @@ -12,7 +12,7 @@ # CHECK: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' # CHECK: scratchWaveOffsetReg: '$sgpr50' # CHECK: frameOffsetReg: '$sgpr50' -# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) +# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) name: reserve_correct_register tracksRegLiveness: true machineFunctionInfo: @@ -25,6 +25,6 @@ stack: body: | bb.0: - renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) S_ENDPGM 0 ... 
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir b/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir index b43705eaf8c0df..1864f1cbec2fab 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir @@ -52,7 +52,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -82,6 +82,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/MSP430/selectcc.ll b/llvm/test/CodeGen/MSP430/selectcc.ll new file mode 100644 index 00000000000000..28b90f0131703e --- /dev/null +++ b/llvm/test/CodeGen/MSP430/selectcc.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=msp430-- < %s | FileCheck %s + +define i16 @select_to_shifts_i16(i16 %a, i16 %b) { +; CHECK-LABEL: select_to_shifts_i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov r12, r14 +; CHECK-NEXT: clr r12 +; CHECK-NEXT: bit #2, r14 +; CHECK-NEXT: jeq .LBB0_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: mov r13, r12 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: ret + %and = and i16 %a, 2 + %tobool = icmp eq i16 %and, 0 + %select = select i1 %tobool, i16 0, i16 %b + ret i16 %select +} + +define i32 @select_to_shifts_i32(i32 %a, i32 %b) { +; CHECK-LABEL: select_to_shifts_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov r12, r11 +; CHECK-NEXT: and #2, r11 +; CHECK-NEXT: clr r13 +; CHECK-NEXT: tst r11 +; CHECK-NEXT: clr r12 +; CHECK-NEXT: jne .LBB1_3 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: tst r11 +; CHECK-NEXT: jne .LBB1_4 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: mov r14, r12 +; CHECK-NEXT: tst r11 +; CHECK-NEXT: jeq .LBB1_2 +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: mov r15, r13 +; CHECK-NEXT: ret + %and = and i32 %a, 2 + %tobool = icmp eq i32 %and, 0 + %select = select i1 %tobool, i32 0, i32 %b + ret i32 %select +} diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll index 7881bf861c29d5..4a1f9cb6a6a9d0 100644 --- a/llvm/test/CodeGen/Mips/cconv/vector.ll +++ b/llvm/test/CodeGen/Mips/cconv/vector.ll @@ -50,23 +50,25 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) { ; ; MIPS32R5EB-LABEL: i8_2: ; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: addiu $sp, $sp, -64 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 64 +; MIPS32R5EB-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 ; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: sw $5, 36($sp) -; MIPS32R5EB-NEXT: sw $4, 40($sp) -; MIPS32R5EB-NEXT: lbu $1, 37($sp) +; MIPS32R5EB-NEXT: sw $5, 48($sp) +; 
MIPS32R5EB-NEXT: sw $4, 52($sp) +; MIPS32R5EB-NEXT: lbu $1, 49($sp) ; MIPS32R5EB-NEXT: sw $1, 28($sp) -; MIPS32R5EB-NEXT: lbu $1, 36($sp) +; MIPS32R5EB-NEXT: lbu $1, 48($sp) ; MIPS32R5EB-NEXT: sw $1, 20($sp) -; MIPS32R5EB-NEXT: lbu $1, 41($sp) +; MIPS32R5EB-NEXT: lbu $1, 53($sp) ; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: lbu $1, 40($sp) +; MIPS32R5EB-NEXT: lbu $1, 52($sp) ; MIPS32R5EB-NEXT: sw $1, 4($sp) ; MIPS32R5EB-NEXT: ld.d $w0, 16($sp) ; MIPS32R5EB-NEXT: ld.d $w1, 0($sp) @@ -74,12 +76,13 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) { ; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177 ; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5EB-NEXT: sb $2, 33($sp) -; MIPS32R5EB-NEXT: sb $1, 32($sp) -; MIPS32R5EB-NEXT: lhu $2, 32($sp) +; MIPS32R5EB-NEXT: sb $2, 45($sp) +; MIPS32R5EB-NEXT: sb $1, 44($sp) +; MIPS32R5EB-NEXT: lhu $2, 44($sp) ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 +; MIPS32R5EB-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 64 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; @@ -151,35 +154,38 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) { ; ; MIPS32R5EL-LABEL: i8_2: ; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: addiu $sp, $sp, -64 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 64 +; MIPS32R5EL-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 ; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: sw $5, 36($sp) -; MIPS32R5EL-NEXT: sw $4, 40($sp) -; MIPS32R5EL-NEXT: lbu $1, 37($sp) +; MIPS32R5EL-NEXT: sw $5, 48($sp) +; MIPS32R5EL-NEXT: sw $4, 52($sp) +; MIPS32R5EL-NEXT: lbu $1, 49($sp) ; MIPS32R5EL-NEXT: sw $1, 24($sp) -; MIPS32R5EL-NEXT: lbu $1, 36($sp) +; MIPS32R5EL-NEXT: lbu $1, 48($sp) ; MIPS32R5EL-NEXT: sw $1, 16($sp) -; MIPS32R5EL-NEXT: lbu $1, 41($sp) +; MIPS32R5EL-NEXT: lbu $1, 53($sp) ; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: lbu $1, 40($sp) +; MIPS32R5EL-NEXT: lbu $1, 52($sp) ; MIPS32R5EL-NEXT: sw $1, 0($sp) ; MIPS32R5EL-NEXT: ld.d $w0, 16($sp) ; MIPS32R5EL-NEXT: ld.d $w1, 0($sp) ; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0 ; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2] -; MIPS32R5EL-NEXT: sb $2, 33($sp) -; MIPS32R5EL-NEXT: sb $1, 32($sp) -; MIPS32R5EL-NEXT: lhu $2, 32($sp) +; MIPS32R5EL-NEXT: sb $2, 45($sp) +; MIPS32R5EL-NEXT: sb $1, 44($sp) +; MIPS32R5EL-NEXT: lhu $2, 44($sp) ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 +; MIPS32R5EL-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 64 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop ; @@ -312,36 +318,38 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x ; MIPS32R5EB: # %bb.0: # %entry ; MIPS32R5EB-NEXT: addiu $sp, $sp, -144 ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 144 -; MIPS32R5EB-NEXT: sw $fp, 140($sp) # 4-byte Folded Spill -; 
MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: sw $ra, 140($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 136($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 ; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: sw $5, 132($sp) -; MIPS32R5EB-NEXT: sw $4, 136($sp) -; MIPS32R5EB-NEXT: lbu $1, 133($sp) +; MIPS32R5EB-NEXT: sw $5, 128($sp) +; MIPS32R5EB-NEXT: sw $4, 132($sp) +; MIPS32R5EB-NEXT: lbu $1, 129($sp) ; MIPS32R5EB-NEXT: sw $1, 76($sp) -; MIPS32R5EB-NEXT: lbu $1, 132($sp) +; MIPS32R5EB-NEXT: lbu $1, 128($sp) ; MIPS32R5EB-NEXT: sw $1, 68($sp) -; MIPS32R5EB-NEXT: lbu $1, 137($sp) +; MIPS32R5EB-NEXT: lbu $1, 133($sp) ; MIPS32R5EB-NEXT: sw $1, 60($sp) -; MIPS32R5EB-NEXT: lbu $1, 136($sp) +; MIPS32R5EB-NEXT: lbu $1, 132($sp) ; MIPS32R5EB-NEXT: sw $1, 52($sp) ; MIPS32R5EB-NEXT: ld.d $w0, 64($sp) ; MIPS32R5EB-NEXT: ld.d $w1, 48($sp) ; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0 -; MIPS32R5EB-NEXT: sw $6, 128($sp) -; MIPS32R5EB-NEXT: lbu $1, 129($sp) +; MIPS32R5EB-NEXT: sw $6, 124($sp) +; MIPS32R5EB-NEXT: lbu $1, 125($sp) ; MIPS32R5EB-NEXT: sw $1, 92($sp) -; MIPS32R5EB-NEXT: lbu $1, 128($sp) +; MIPS32R5EB-NEXT: lbu $1, 124($sp) ; MIPS32R5EB-NEXT: sw $1, 84($sp) ; MIPS32R5EB-NEXT: ld.d $w1, 80($sp) ; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1 -; MIPS32R5EB-NEXT: sw $7, 124($sp) -; MIPS32R5EB-NEXT: lbu $1, 125($sp) +; MIPS32R5EB-NEXT: sw $7, 120($sp) +; MIPS32R5EB-NEXT: lbu $1, 121($sp) ; MIPS32R5EB-NEXT: sw $1, 108($sp) -; MIPS32R5EB-NEXT: lbu $1, 124($sp) +; MIPS32R5EB-NEXT: lbu $1, 120($sp) ; MIPS32R5EB-NEXT: sw $1, 100($sp) ; MIPS32R5EB-NEXT: ld.d $w1, 96($sp) ; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1 @@ -366,11 +374,12 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x ; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177 ; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5EB-NEXT: sb $2, 121($sp) -; MIPS32R5EB-NEXT: sb $1, 120($sp) -; MIPS32R5EB-NEXT: lhu $2, 120($sp) +; MIPS32R5EB-NEXT: sb $2, 117($sp) +; MIPS32R5EB-NEXT: sb $1, 116($sp) +; MIPS32R5EB-NEXT: lhu $2, 116($sp) ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 140($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $fp, 136($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 140($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 144 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop @@ -550,36 +559,38 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x ; MIPS32R5EL: # %bb.0: # %entry ; MIPS32R5EL-NEXT: addiu $sp, $sp, -144 ; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 144 -; MIPS32R5EL-NEXT: sw $fp, 140($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: sw $ra, 140($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 136($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 ; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: sw $5, 132($sp) -; MIPS32R5EL-NEXT: sw $4, 136($sp) -; MIPS32R5EL-NEXT: lbu $1, 133($sp) +; MIPS32R5EL-NEXT: sw $5, 128($sp) +; MIPS32R5EL-NEXT: sw $4, 132($sp) +; MIPS32R5EL-NEXT: lbu $1, 129($sp) ; MIPS32R5EL-NEXT: sw $1, 72($sp) -; MIPS32R5EL-NEXT: lbu $1, 132($sp) +; MIPS32R5EL-NEXT: lbu $1, 128($sp) ; 
MIPS32R5EL-NEXT: sw $1, 64($sp) -; MIPS32R5EL-NEXT: lbu $1, 137($sp) +; MIPS32R5EL-NEXT: lbu $1, 133($sp) ; MIPS32R5EL-NEXT: sw $1, 56($sp) -; MIPS32R5EL-NEXT: lbu $1, 136($sp) +; MIPS32R5EL-NEXT: lbu $1, 132($sp) ; MIPS32R5EL-NEXT: sw $1, 48($sp) ; MIPS32R5EL-NEXT: ld.d $w0, 64($sp) ; MIPS32R5EL-NEXT: ld.d $w1, 48($sp) ; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0 -; MIPS32R5EL-NEXT: sw $6, 128($sp) -; MIPS32R5EL-NEXT: lbu $1, 129($sp) +; MIPS32R5EL-NEXT: sw $6, 124($sp) +; MIPS32R5EL-NEXT: lbu $1, 125($sp) ; MIPS32R5EL-NEXT: sw $1, 88($sp) -; MIPS32R5EL-NEXT: lbu $1, 128($sp) +; MIPS32R5EL-NEXT: lbu $1, 124($sp) ; MIPS32R5EL-NEXT: sw $1, 80($sp) ; MIPS32R5EL-NEXT: ld.d $w1, 80($sp) ; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 -; MIPS32R5EL-NEXT: sw $7, 124($sp) -; MIPS32R5EL-NEXT: lbu $1, 125($sp) +; MIPS32R5EL-NEXT: sw $7, 120($sp) +; MIPS32R5EL-NEXT: lbu $1, 121($sp) ; MIPS32R5EL-NEXT: sw $1, 104($sp) -; MIPS32R5EL-NEXT: lbu $1, 124($sp) +; MIPS32R5EL-NEXT: lbu $1, 120($sp) ; MIPS32R5EL-NEXT: sw $1, 96($sp) ; MIPS32R5EL-NEXT: ld.d $w1, 96($sp) ; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 @@ -603,11 +614,12 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x ; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1 ; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2] -; MIPS32R5EL-NEXT: sb $2, 121($sp) -; MIPS32R5EL-NEXT: sb $1, 120($sp) -; MIPS32R5EL-NEXT: lhu $2, 120($sp) +; MIPS32R5EL-NEXT: sb $2, 117($sp) +; MIPS32R5EL-NEXT: sb $1, 116($sp) +; MIPS32R5EL-NEXT: lhu $2, 116($sp) ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 140($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $fp, 136($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 140($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 144 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop @@ -952,8 +964,10 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) { ; MIPS32R5EB: # %bb.0: ; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 @@ -1019,7 +1033,8 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) { ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop @@ -1088,8 +1103,10 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) { ; MIPS32R5EL: # %bb.0: ; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 ; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 @@ -1155,7 +1172,8 @@ define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) { ; MIPS32R5EL-NEXT: copy_s.w $2, 
$w0[0] ; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop @@ -1471,23 +1489,25 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) { ; ; MIPS32R5EB-LABEL: i16_2: ; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: addiu $sp, $sp, -64 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 64 +; MIPS32R5EB-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 ; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: sw $5, 36($sp) -; MIPS32R5EB-NEXT: sw $4, 40($sp) -; MIPS32R5EB-NEXT: lhu $1, 38($sp) +; MIPS32R5EB-NEXT: sw $5, 48($sp) +; MIPS32R5EB-NEXT: sw $4, 52($sp) +; MIPS32R5EB-NEXT: lhu $1, 50($sp) ; MIPS32R5EB-NEXT: sw $1, 28($sp) -; MIPS32R5EB-NEXT: lhu $1, 36($sp) +; MIPS32R5EB-NEXT: lhu $1, 48($sp) ; MIPS32R5EB-NEXT: sw $1, 20($sp) -; MIPS32R5EB-NEXT: lhu $1, 42($sp) +; MIPS32R5EB-NEXT: lhu $1, 54($sp) ; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: lhu $1, 40($sp) +; MIPS32R5EB-NEXT: lhu $1, 52($sp) ; MIPS32R5EB-NEXT: sw $1, 4($sp) ; MIPS32R5EB-NEXT: ld.d $w0, 16($sp) ; MIPS32R5EB-NEXT: ld.d $w1, 0($sp) @@ -1495,12 +1515,13 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) { ; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177 ; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5EB-NEXT: sh $2, 34($sp) -; MIPS32R5EB-NEXT: sh $1, 32($sp) -; MIPS32R5EB-NEXT: lw $2, 32($sp) +; MIPS32R5EB-NEXT: sh $2, 46($sp) +; MIPS32R5EB-NEXT: sh $1, 44($sp) +; MIPS32R5EB-NEXT: lw $2, 44($sp) ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 +; MIPS32R5EB-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 64 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; @@ -1532,35 +1553,38 @@ define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) { ; ; MIPS32R5EL-LABEL: i16_2: ; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: addiu $sp, $sp, -64 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 64 +; MIPS32R5EL-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 ; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: sw $5, 36($sp) -; MIPS32R5EL-NEXT: sw $4, 40($sp) -; MIPS32R5EL-NEXT: lhu $1, 38($sp) +; MIPS32R5EL-NEXT: sw $5, 48($sp) +; MIPS32R5EL-NEXT: sw $4, 52($sp) +; MIPS32R5EL-NEXT: lhu $1, 50($sp) ; MIPS32R5EL-NEXT: sw $1, 24($sp) -; MIPS32R5EL-NEXT: lhu $1, 36($sp) +; MIPS32R5EL-NEXT: lhu $1, 48($sp) ; MIPS32R5EL-NEXT: sw $1, 16($sp) 
-; MIPS32R5EL-NEXT: lhu $1, 42($sp) +; MIPS32R5EL-NEXT: lhu $1, 54($sp) ; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: lhu $1, 40($sp) +; MIPS32R5EL-NEXT: lhu $1, 52($sp) ; MIPS32R5EL-NEXT: sw $1, 0($sp) ; MIPS32R5EL-NEXT: ld.d $w0, 16($sp) ; MIPS32R5EL-NEXT: ld.d $w1, 0($sp) ; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0 ; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2] -; MIPS32R5EL-NEXT: sh $2, 34($sp) -; MIPS32R5EL-NEXT: sh $1, 32($sp) -; MIPS32R5EL-NEXT: lw $2, 32($sp) +; MIPS32R5EL-NEXT: sh $2, 46($sp) +; MIPS32R5EL-NEXT: sh $1, 44($sp) +; MIPS32R5EL-NEXT: lw $2, 44($sp) ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 +; MIPS32R5EL-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 64 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop %1 = add <2 x i16> %a, %b @@ -1622,8 +1646,10 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) { ; MIPS32R5EB: # %bb.0: ; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 @@ -1665,7 +1691,8 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) { ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop @@ -1710,8 +1737,10 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) { ; MIPS32R5EL: # %bb.0: ; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 ; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 @@ -1753,7 +1782,8 @@ define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) { ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop @@ -1962,8 +1992,10 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; MIPS32R5EB: # %bb.0: ; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, 
$sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 @@ -1979,7 +2011,8 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop @@ -2010,8 +2043,10 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; MIPS32R5EL: # %bb.0: ; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 ; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 @@ -2026,7 +2061,8 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop @@ -2312,8 +2348,10 @@ define void @float_2(<2 x float> %a, <2 x float> %b) { ; MIPS32R5: # %bb.0: ; MIPS32R5-NEXT: addiu $sp, $sp, -48 ; MIPS32R5-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 30, -4 +; MIPS32R5-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill +; MIPS32R5-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill +; MIPS32R5-NEXT: .cfi_offset 31, -4 +; MIPS32R5-NEXT: .cfi_offset 30, -8 ; MIPS32R5-NEXT: move $fp, $sp ; MIPS32R5-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5-NEXT: addiu $1, $zero, -16 @@ -2331,7 +2369,8 @@ define void @float_2(<2 x float> %a, <2 x float> %b) { ; MIPS32R5-NEXT: swc1 $f1, 4($2) ; MIPS32R5-NEXT: swc1 $f0, %lo(float_res_v2f32)($1) ; MIPS32R5-NEXT: move $sp, $fp -; MIPS32R5-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload +; MIPS32R5-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload +; MIPS32R5-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload ; MIPS32R5-NEXT: addiu $sp, $sp, 48 ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop @@ -2794,8 +2833,10 @@ define <8 x i8> @ret_8_i8() { ; MIPS32R5EB: # %bb.0: ; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $fp, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 @@ -2810,7 +2851,8 @@ define <8 x i8> @ret_8_i8() { ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 ; MIPS32R5EB-NEXT: jr 
$ra ; MIPS32R5EB-NEXT: nop @@ -2829,8 +2871,10 @@ define <8 x i8> @ret_8_i8() { ; MIPS32R5EL: # %bb.0: ; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 ; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $fp, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 @@ -2845,7 +2889,8 @@ define <8 x i8> @ret_8_i8() { ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop @@ -2965,8 +3010,10 @@ define <4 x i16> @ret_4_i16() { ; MIPS32R5EB: # %bb.0: ; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $fp, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EB-NEXT: addiu $1, $zero, -16 @@ -2981,7 +3028,8 @@ define <4 x i16> @ret_4_i16() { ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop @@ -3000,8 +3048,10 @@ define <4 x i16> @ret_4_i16() { ; MIPS32R5EL: # %bb.0: ; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 ; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $fp, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 @@ -3016,7 +3066,8 @@ define <4 x i16> @ret_4_i16() { ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop @@ -3098,8 +3149,10 @@ define <2 x i32> @ret_2_i32() { ; MIPS32R5EB: # %bb.0: ; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $fp, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 30, -4 +; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 ; MIPS32R5EB-NEXT: move $fp, $sp ; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 ; 
MIPS32R5EB-NEXT: addiu $1, $zero, -16 @@ -3114,7 +3167,8 @@ define <2 x i32> @ret_2_i32() { ; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] ; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] ; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop @@ -3133,8 +3187,10 @@ define <2 x i32> @ret_2_i32() { ; MIPS32R5EL: # %bb.0: ; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 ; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $fp, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 30, -4 +; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 ; MIPS32R5EL-NEXT: move $fp, $sp ; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5EL-NEXT: addiu $1, $zero, -16 @@ -3149,7 +3205,8 @@ define <2 x i32> @ret_2_i32() { ; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] ; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] ; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop @@ -6073,8 +6130,10 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) { ; MIPS32R5: # %bb.0: # %entry ; MIPS32R5-NEXT: addiu $sp, $sp, -64 ; MIPS32R5-NEXT: .cfi_def_cfa_offset 64 -; MIPS32R5-NEXT: sw $fp, 60($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 30, -4 +; MIPS32R5-NEXT: sw $ra, 60($sp) # 4-byte Folded Spill +; MIPS32R5-NEXT: sw $fp, 56($sp) # 4-byte Folded Spill +; MIPS32R5-NEXT: .cfi_offset 31, -4 +; MIPS32R5-NEXT: .cfi_offset 30, -8 ; MIPS32R5-NEXT: move $fp, $sp ; MIPS32R5-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5-NEXT: addiu $1, $zero, -16 @@ -6098,7 +6157,8 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) { ; MIPS32R5-NEXT: splati.w $w1, $w0[1] ; MIPS32R5-NEXT: add.s $f0, $f0, $f1 ; MIPS32R5-NEXT: move $sp, $fp -; MIPS32R5-NEXT: lw $fp, 60($sp) # 4-byte Folded Reload +; MIPS32R5-NEXT: lw $fp, 56($sp) # 4-byte Folded Reload +; MIPS32R5-NEXT: lw $ra, 60($sp) # 4-byte Folded Reload ; MIPS32R5-NEXT: addiu $sp, $sp, 64 ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop diff --git a/llvm/test/CodeGen/Mips/dynamic-stack-realignment.ll b/llvm/test/CodeGen/Mips/dynamic-stack-realignment.ll index 5054b9cd02f478..50acd8ffb1eb19 100644 --- a/llvm/test/CodeGen/Mips/dynamic-stack-realignment.ll +++ b/llvm/test/CodeGen/Mips/dynamic-stack-realignment.ll @@ -163,8 +163,9 @@ entry: ; GP32-M: addiu $sp, $sp, -1024 ; GP32-MMR2: addiusp -1024 ; GP32-MMR6: addiu $sp, $sp, -1024 - ; GP32: sw $fp, 1020($sp) - ; GP32: sw $23, 1016($sp) + ; GP32: sw $ra, 1020($sp) + ; GP32: sw $fp, 1016($sp) + ; GP32: sw $23, 1012($sp) ; ; GP32: move $fp, $sp ; GP32: addiu $[[T0:[0-9]+|gp]], $zero, -512 @@ -177,8 +178,9 @@ entry: ; epilogue ; GP32: move $sp, $fp - ; GP32: lw $23, 1016($sp) - ; GP32: lw $fp, 1020($sp) + ; GP32: lw $23, 1012($sp) + ; GP32: lw $fp, 1016($sp) + ; GP32: lw $ra, 1020($sp) ; GP32-M: addiu $sp, $sp, 1024 ; GP32-MMR2: addiusp 1024 ; GP32-MMR6: addiu $sp, $sp, 1024 @@ -201,8 +203,9 @@ entry: ; FIXME: We are currently over-allocating stack space. 
; N32: addiu $sp, $sp, -1024 ; N64: daddiu $sp, $sp, -1024 - ; GP64: sd $fp, 1016($sp) - ; GP64: sd $23, 1008($sp) + ; GP64: sd $ra, 1016($sp) + ; GP64: sd $fp, 1008($sp) + ; GP64: sd $23, 1000($sp) ; ; GP64: move $fp, $sp ; GP64: addiu $[[T0:[0-9]+|gp]], $zero, -512 @@ -215,8 +218,9 @@ entry: ; epilogue ; GP64: move $sp, $fp - ; GP64: ld $23, 1008($sp) - ; GP64: ld $fp, 1016($sp) + ; GP64: ld $23, 1000($sp) + ; GP64: ld $fp, 1008($sp) + ; GP64: ld $ra, 1016($sp) ; N32: addiu $sp, $sp, 1024 ; N64: daddiu $sp, $sp, 1024 diff --git a/llvm/test/CodeGen/Mips/frame-address.ll b/llvm/test/CodeGen/Mips/frame-address.ll index 0ab7da30e785f8..f7ceb575c65ae9 100644 --- a/llvm/test/CodeGen/Mips/frame-address.ll +++ b/llvm/test/CodeGen/Mips/frame-address.ll @@ -1,17 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=mipsel < %s | FileCheck %s declare i8* @llvm.frameaddress(i32) nounwind readnone define i8* @f() nounwind uwtable { +; CHECK-LABEL: f: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: sw $ra, 4($sp) # 4-byte Folded Spill +; CHECK-NEXT: sw $fp, 0($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: .cfi_offset 30, -8 +; CHECK-NEXT: move $fp, $sp +; CHECK-NEXT: .cfi_def_cfa_register 30 +; CHECK-NEXT: move $2, $fp +; CHECK-NEXT: move $sp, $fp +; CHECK-NEXT: lw $fp, 0($sp) # 4-byte Folded Reload +; CHECK-NEXT: lw $ra, 4($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 8 entry: %0 = call i8* @llvm.frameaddress(i32 0) ret i8* %0 - -; CHECK: .cfi_startproc -; CHECK: .cfi_def_cfa_offset 8 -; CHECK: .cfi_offset 30, -4 -; CHECK: move $fp, $sp -; CHECK: .cfi_def_cfa_register 30 -; CHECK: move $2, $fp -; CHECK: .cfi_endproc } diff --git a/llvm/test/CodeGen/Mips/micromips-ase-function-attribute.ll b/llvm/test/CodeGen/Mips/micromips-ase-function-attribute.ll index fe82b7c5b6cf08..cd78166d372a68 100644 --- a/llvm/test/CodeGen/Mips/micromips-ase-function-attribute.ll +++ b/llvm/test/CodeGen/Mips/micromips-ase-function-attribute.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=mips-unknown-linux -filetype=obj %s -o - | \ -; RUN: llvm-readobj --mips-abi-flags | \ +; RUN: llvm-readobj -A | \ ; RUN: FileCheck --check-prefix=ASE-MICROMIPS %s define void @_Z3foov() #0 { diff --git a/llvm/test/CodeGen/Mips/no-frame-pointer-elim.ll b/llvm/test/CodeGen/Mips/no-frame-pointer-elim.ll new file mode 100644 index 00000000000000..5242ff4612976b --- /dev/null +++ b/llvm/test/CodeGen/Mips/no-frame-pointer-elim.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=mips64 -relocation-model=static < %s \ +; RUN: | FileCheck %s --check-prefix STATIC +; RUN: llc -march=mips64 -relocation-model=pic < %s \ +; RUN: | FileCheck %s --check-prefix PIC + +declare dso_local void @callee() noreturn nounwind + +define dso_local void @caller() nounwind "no-frame-pointer-elim-non-leaf" { +; STATIC-LABEL: caller: +; STATIC: # %bb.0: # %entry +; STATIC-NEXT: daddiu $sp, $sp, -16 +; STATIC-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; STATIC-NEXT: sd $fp, 0($sp) # 8-byte Folded Spill +; STATIC-NEXT: move $fp, $sp +; STATIC-NEXT: jal callee +; STATIC-NEXT: nop +; +; PIC-LABEL: caller: +; PIC: # %bb.0: # %entry +; PIC-NEXT: daddiu $sp, $sp, -32 +; PIC-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill +; PIC-NEXT: sd $fp, 16($sp) # 8-byte Folded Spill +; PIC-NEXT: sd $gp, 8($sp) # 8-byte Folded Spill +; PIC-NEXT: move $fp, 
$sp +; PIC-NEXT: lui $1, %hi(%neg(%gp_rel(caller))) +; PIC-NEXT: daddu $1, $1, $25 +; PIC-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(caller))) +; PIC-NEXT: ld $25, %call16(callee)($gp) +; PIC-NEXT: .reloc .Ltmp0, R_MIPS_JALR, callee +; PIC-NEXT: .Ltmp0: +; PIC-NEXT: jalr $25 +; PIC-NEXT: nop +entry: + tail call void @callee() + unreachable +} diff --git a/llvm/test/CodeGen/Mips/tnaked.ll b/llvm/test/CodeGen/Mips/tnaked.ll index 7dff19c5d0009c..e88396bac50807 100644 --- a/llvm/test/CodeGen/Mips/tnaked.ll +++ b/llvm/test/CodeGen/Mips/tnaked.ll @@ -21,7 +21,7 @@ entry: ; CHECK: .ent tnonaked ; CHECK-LABEL: tnonaked: ; CHECK: .frame $fp,8,$ra -; CHECK: .mask 0x40000000,-4 +; CHECK: .mask 0xc0000000,-4 ; CHECK: .fmask 0x00000000,0 ; CHECK: addiu $sp, $sp, -8 diff --git a/llvm/test/CodeGen/Mips/v2i16tof32.ll b/llvm/test/CodeGen/Mips/v2i16tof32.ll index 334413b03d58d0..ab9fa9eefd413b 100644 --- a/llvm/test/CodeGen/Mips/v2i16tof32.ll +++ b/llvm/test/CodeGen/Mips/v2i16tof32.ll @@ -9,8 +9,10 @@ define float @f(<8 x i16>* %a) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addiu $sp, $sp, -32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sw $fp, 28($sp) # 4-byte Folded Spill -; CHECK-NEXT: .cfi_offset 30, -4 +; CHECK-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; CHECK-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: .cfi_offset 30, -8 ; CHECK-NEXT: move $fp, $sp ; CHECK-NEXT: .cfi_def_cfa_register 30 ; CHECK-NEXT: addiu $1, $zero, -16 @@ -25,7 +27,8 @@ define float @f(<8 x i16>* %a) { ; CHECK-NEXT: sw $1, 4($sp) ; CHECK-NEXT: mtc1 $2, $f0 ; CHECK-NEXT: move $sp, $fp -; CHECK-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload +; CHECK-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; CHECK-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload ; CHECK-NEXT: jr $ra ; CHECK-NEXT: addiu $sp, $sp, 32 entry: diff --git a/llvm/test/CodeGen/PowerPC/block-placement.mir b/llvm/test/CodeGen/PowerPC/block-placement.mir index 54bd9b8e92393e..9dc911f785b5ee 100644 --- a/llvm/test/CodeGen/PowerPC/block-placement.mir +++ b/llvm/test/CodeGen/PowerPC/block-placement.mir @@ -209,10 +209,14 @@ body: | BLR8 implicit $lr8, implicit $rm, implicit killed $x3 ; CHECK: bb.5.if.else.i: - ; CHECK-NEXT: renamable $x3 = LI8 1 - ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + ; CHECK: successors: %bb.11(0x80000000) + ; CHECK: B %bb.11 ; CHECK: bb.8.while.body.i (align 16): - ; CHECK: successors: %bb.5(0x04000000), %bb.9(0x7c000000) - ; CHECK: BCC 76, killed renamable $cr0, %bb.5 + ; CHECK: successors: %bb.11(0x04000000), %bb.9(0x7c000000) + ; CHECK: BCC 76, killed renamable $cr0, %bb.11 + + ; CHECK: bb.11: + ; CHECK: renamable $x3 = LI8 1 + ; CHECK-NEXT: BLR8 implicit $lr8, implicit $rm, implicit killed $x3 ... 
diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll index 864f723575f4b7..1d85f4f9680ae1 100644 --- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll +++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll @@ -14,7 +14,7 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) { ; CHECK-NEXT: xvredp 2, 0 ; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l ; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xvnmsubadp 1, 2, 0 +; CHECK-NEXT: xvnmsubadp 1, 0, 2 ; CHECK-NEXT: xvmaddadp 2, 2, 1 ; CHECK-NEXT: lxvd2x 1, 0, 3 ; CHECK-NEXT: xxswapd 1, 1 diff --git a/llvm/test/CodeGen/PowerPC/f128-fma.ll b/llvm/test/CodeGen/PowerPC/f128-fma.ll index 9c3c062e4cc7d9..8bd2ffe03e58c1 100644 --- a/llvm/test/CodeGen/PowerPC/f128-fma.ll +++ b/llvm/test/CodeGen/PowerPC/f128-fma.ll @@ -129,7 +129,7 @@ entry: ; CHECK-DAG: lxv v[[REG3:[0-9]+]], 0(r3) ; CHECK-DAG: lxv v[[REG4:[0-9]+]], 0(r4) ; CHECK-DAG: lxv v[[REG5:[0-9]+]], 0(r5) -; CHECK: xsnmsubqp v[[REG3]], v[[REG5]], v[[REG4]] +; CHECK: xsnmsubqp v[[REG3]], v[[REG4]], v[[REG5]] ; CHECK-NEXT: stxv v[[REG3]], 0(r6) ; CHECK-NEXT: blr } diff --git a/llvm/test/CodeGen/PowerPC/fma-assoc.ll b/llvm/test/CodeGen/PowerPC/fma-assoc.ll index ffa3c22fd83021..9bca280015d528 100644 --- a/llvm/test/CodeGen/PowerPC/fma-assoc.ll +++ b/llvm/test/CodeGen/PowerPC/fma-assoc.ll @@ -260,7 +260,6 @@ define double @test_FMSUB_ASSOC_EXT3(float %A, float %B, double %C, ; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT3: ; CHECK-VSX: xsnmsubmdp ; CHECK-VSX-NEXT: xsnmsubadp -; CHECK-VSX-NEXT: fmr ; CHECK-VSX-NEXT: blr } diff --git a/llvm/test/CodeGen/PowerPC/fma-ext.ll b/llvm/test/CodeGen/PowerPC/fma-ext.ll index 57771d3a453c56..f8ed49f9ca7c8b 100644 --- a/llvm/test/CodeGen/PowerPC/fma-ext.ll +++ b/llvm/test/CodeGen/PowerPC/fma-ext.ll @@ -54,7 +54,6 @@ define double @test_FMSUB_EXT2(float %A, float %B, double %C) { ; CHECK-VSX-LABEL: test_FMSUB_EXT2: ; CHECK-VSX: xsnmsubmdp -; CHECK-VSX-NEXT: fmr ; CHECK-VSX-NEXT: blr } diff --git a/llvm/test/CodeGen/PowerPC/fp-intrinsics-fptosi-legal.ll b/llvm/test/CodeGen/PowerPC/fp-intrinsics-fptosi-legal.ll index 98f2f36db745a2..dff47786e387fe 100644 --- a/llvm/test/CodeGen/PowerPC/fp-intrinsics-fptosi-legal.ll +++ b/llvm/test/CodeGen/PowerPC/fp-intrinsics-fptosi-legal.ll @@ -8,10 +8,11 @@ ; Verify that no gross errors happen. 
; CHECK-LABEL: @f20 ; COMMON: cfdctsiz -define i32 @f20(double %a) { +define i32 @f20(double %a) strictfp { entry: %result = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double 42.1, metadata !"fpexcept.strict") + strictfp ret i32 %result } diff --git a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll index bb30a77376e794..9ab320cd1eacfe 100644 --- a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll +++ b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll @@ -13,7 +13,7 @@ define <4 x float> @repeated_fp_divisor(float %a, <4 x float> %b) { ; CHECK-NEXT: lvx 4, 0, 3 ; CHECK-NEXT: xxspltw 0, 0, 0 ; CHECK-NEXT: xvresp 1, 0 -; CHECK-NEXT: xvnmsubasp 35, 1, 0 +; CHECK-NEXT: xvnmsubasp 35, 0, 1 ; CHECK-NEXT: xvmulsp 0, 34, 36 ; CHECK-NEXT: xvmaddasp 1, 1, 35 ; CHECK-NEXT: xvmulsp 34, 0, 1 diff --git a/llvm/test/CodeGen/PowerPC/sh-overflow.mir b/llvm/test/CodeGen/PowerPC/sh-overflow.mir new file mode 100644 index 00000000000000..31cd710c39ea00 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/sh-overflow.mir @@ -0,0 +1,58 @@ +# RUN: llc -O3 -mtriple=powerpc64le-unknown-linux-gnu -start-after ppc-mi-peepholes -ppc-late-peephole -ppc-asm-full-reg-names -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: special_right_shift32_0 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: gprc } + - { id: 1, class: gprc } + - { id: 2, class: gprc } +liveins: + - { reg: '$r3', virtual-reg: '%0' } +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $r3 + + ; Ensure we do not attempt to transform this into srwi $r3, $r3, 0 in the + ; form specified by ISA 3.0b (rlwinm $r3, $r3, 32 - 0, 0, 31) + + ; CHECK-LABEL: special_right_shift32_0: + ; CHECK: slwi r[[#]], r[[#]], 0 + + %0:gprc = COPY killed $r3 + %1:gprc = LI 0 + %2:gprc = SRW killed %0, killed %1 + $r3 = COPY killed %2 + BLR implicit $lr, implicit $rm, implicit killed $r3 + +... +--- +name: special_right_shift64_0 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc } + - { id: 1, class: gprc } + - { id: 2, class: g8rc } +liveins: + - { reg: '$x3', virtual-reg: '%0' } +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x3 + + ; Ensure we do not attempt to transform this into srdi $r3, $r3, 0 in the + ; form specified by ISA 3.0b (rldicl $r3, $r3, 64 - 0, 0) + + ; CHECK-LABEL: special_right_shift64_0: + ; CHECK: rotldi r[[#]], r[[#]], 0 + + %0:g8rc = COPY killed $x3 + %1:gprc = LI 0 + %2:g8rc = SRD killed %0, killed %1 + $x3 = COPY killed %2 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + +... 
diff --git a/llvm/test/CodeGen/PowerPC/srem-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-lkk.ll new file mode 100644 index 00000000000000..ccd1b612235af1 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/srem-lkk.ll @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck -check-prefixes=CHECK,CHECK64 %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck -check-prefixes=CHECK,CHECK32 %s + +define i32 @fold_srem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_srem_positive_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -21386 +; CHECK-NEXT: ori 4, 4, 37253 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: add 4, 4, 3 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 6 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 4, 4, 95 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positive_even(i32 %x) { +; CHECK-LABEL: fold_srem_positive_even: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, 15827 +; CHECK-NEXT: ori 4, 4, 36849 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 8 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 4, 4, 1060 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; CHECK-LABEL: fold_srem_negative_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -23206 +; CHECK-NEXT: ori 4, 4, 65445 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 8 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 4, 4, -723 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; CHECK-LABEL: fold_srem_negative_even: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -731 +; CHECK-NEXT: ori 4, 4, 62439 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 8 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 4, 4, -22981 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. +define i32 @combine_srem_sdiv(i32 %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -21386 +; CHECK-NEXT: ori 4, 4, 37253 +; CHECK-NEXT: mulhw 4, 3, 4 +; CHECK-NEXT: add 4, 4, 3 +; CHECK-NEXT: srwi 5, 4, 31 +; CHECK-NEXT: srawi 4, 4, 6 +; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: mulli 5, 4, 95 +; CHECK-NEXT: subf 3, 5, 3 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: blr + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: srawi 4, 3, 6 +; CHECK-NEXT: addze 4, 4 +; CHECK-NEXT: slwi 4, 4, 6 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: blr + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. 
+define i32 @dont_fold_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_fold_srem_i32_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: srawi 4, 3, 31 +; CHECK-NEXT: addze 4, 4 +; CHECK-NEXT: slwi 4, 4, 31 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: blr + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stw 0, 4(1) +; CHECK-NEXT: stwu 1, -16(1) +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset lr, 4 +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: li 6, 98 +; CHECK-NEXT: bl __moddi3@PLT +; CHECK-NEXT: lwz 0, 20(1) +; CHECK-NEXT: addi 1, 1, 16 +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr + %1 = srem i64 %x, 98 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll new file mode 100644 index 00000000000000..d795f6b62fab3e --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -0,0 +1,1675 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P9LE +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P9BE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8LE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8BE + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; P9LE-LABEL: fold_srem_vec_1: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, 31710 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 63421 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r4, r5 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, 21399 +; P9LE-NEXT: mulli r4, r4, -124 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 33437 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: rldicl r4, r4, 32, 32 +; P9LE-NEXT: srawi r4, r4, 5 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, -16728 +; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 63249 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r5, r4, 1, 63 
+; P9LE-NEXT: rldicl r4, r4, 32, 32 +; P9LE-NEXT: srawi r4, r4, 8 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, -1003 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: fold_srem_vec_1: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: lis r4, 31710 +; P9BE-NEXT: ori r4, r4, 63421 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r4, r3, r4 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, -124 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, -16728 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 63249 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: srawi r4, r4, 8 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, -1003 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, 21399 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 33437 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: srawi r4, r4, 5 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 98 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: blr +; +; P8LE-LABEL: fold_srem_vec_1: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r4, 21399 +; P8LE-NEXT: lis r9, -16728 +; P8LE-NEXT: lis r11, -21386 +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r4, r4, 33437 +; P8LE-NEXT: ori r9, r9, 63249 +; P8LE-NEXT: ori r11, r11, 37253 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: rldicl r3, r5, 32, 48 +; P8LE-NEXT: rldicl r6, r5, 16, 48 +; P8LE-NEXT: clrldi r7, r5, 48 +; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: extsh r10, r6 +; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: extsh r12, r7 +; P8LE-NEXT: extsw r10, r10 +; P8LE-NEXT: mulld r4, r8, r4 +; P8LE-NEXT: lis r8, 31710 +; P8LE-NEXT: extsh r0, r5 +; P8LE-NEXT: extsw r12, r12 +; P8LE-NEXT: mulld r9, r10, r9 +; P8LE-NEXT: ori r8, r8, 63421 +; P8LE-NEXT: extsw r10, r0 +; P8LE-NEXT: mulld r11, r12, r11 +; P8LE-NEXT: mulld r8, r10, r8 +; P8LE-NEXT: rldicl r0, r4, 1, 63 +; P8LE-NEXT: rldicl r4, r4, 32, 32 +; P8LE-NEXT: rldicl r30, r9, 1, 63 +; P8LE-NEXT: rldicl r9, r9, 32, 32 +; P8LE-NEXT: rldicl r11, r11, 32, 32 +; P8LE-NEXT: rldicl r8, r8, 32, 32 +; P8LE-NEXT: add r11, r11, r12 +; P8LE-NEXT: 
srawi r4, r4, 5 +; P8LE-NEXT: subf r8, r10, r8 +; P8LE-NEXT: srawi r9, r9, 8 +; P8LE-NEXT: srwi r10, r11, 31 +; P8LE-NEXT: add r4, r4, r0 +; P8LE-NEXT: srawi r11, r11, 6 +; P8LE-NEXT: add r9, r9, r30 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: add r10, r11, r10 +; P8LE-NEXT: srwi r11, r8, 31 +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: mulli r4, r4, 98 +; P8LE-NEXT: mulli r9, r9, -1003 +; P8LE-NEXT: add r8, r8, r11 +; P8LE-NEXT: mulli r10, r10, 95 +; P8LE-NEXT: mulli r8, r8, -124 +; P8LE-NEXT: subf r3, r4, r3 +; P8LE-NEXT: subf r4, r9, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r10, r7 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: subf r4, r8, r5 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: fold_srem_vec_1: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, -16728 +; P8BE-NEXT: lis r9, 31710 +; P8BE-NEXT: lis r8, 21399 +; P8BE-NEXT: lis r10, -21386 +; P8BE-NEXT: ori r3, r3, 63249 +; P8BE-NEXT: ori r9, r9, 63421 +; P8BE-NEXT: ori r8, r8, 33437 +; P8BE-NEXT: ori r10, r10, 37253 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: extsw r7, r7 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: mulld r9, r7, r9 +; P8BE-NEXT: mulld r8, r6, r8 +; P8BE-NEXT: mulld r10, r4, r10 +; P8BE-NEXT: rldicl r11, r3, 1, 63 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: rldicl r9, r9, 32, 32 +; P8BE-NEXT: rldicl r12, r8, 1, 63 +; P8BE-NEXT: rldicl r8, r8, 32, 32 +; P8BE-NEXT: rldicl r10, r10, 32, 32 +; P8BE-NEXT: subf r9, r7, r9 +; P8BE-NEXT: srawi r3, r3, 8 +; P8BE-NEXT: srawi r8, r8, 5 +; P8BE-NEXT: add r10, r10, r4 +; P8BE-NEXT: add r3, r3, r11 +; P8BE-NEXT: srwi r11, r9, 31 +; P8BE-NEXT: add r8, r8, r12 +; P8BE-NEXT: srawi r9, r9, 6 +; P8BE-NEXT: mulli r3, r3, -1003 +; P8BE-NEXT: add r9, r9, r11 +; P8BE-NEXT: srwi r11, r10, 31 +; P8BE-NEXT: srawi r10, r10, 6 +; P8BE-NEXT: mulli r8, r8, 98 +; P8BE-NEXT: add r10, r10, r11 +; P8BE-NEXT: mulli r9, r9, -124 +; P8BE-NEXT: mulli r10, r10, 95 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: subf r5, r8, r6 +; P8BE-NEXT: mtvsrd v2, r3 +; P8BE-NEXT: subf r6, r9, r7 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: subf r4, r10, r4 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r6, 48 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; P9LE-LABEL: fold_srem_vec_2: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r6, r4, r5 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: add r4, r6, r4 +; P9LE-NEXT: srwi r6, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: 
mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r6, r4, r5 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: add r4, r6, r4 +; P9LE-NEXT: srwi r6, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r6, r4, r5 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: add r4, r6, r4 +; P9LE-NEXT: srwi r6, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: fold_srem_vec_2: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r6, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r5, r5, 95 +; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r6, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r5, r5, 95 +; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r5, r3, r4 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r6, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r5, r5, 95 +; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: fold_srem_vec_2: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r4, -21386 +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r4, r4, 37253 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: clrldi r3, r5, 48 +; 
P8LE-NEXT: rldicl r7, r5, 32, 48 +; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: rldicl r6, r5, 48, 48 +; P8LE-NEXT: extsh r10, r7 +; P8LE-NEXT: rldicl r5, r5, 16, 48 +; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: extsh r9, r6 +; P8LE-NEXT: extsw r10, r10 +; P8LE-NEXT: extsh r11, r5 +; P8LE-NEXT: mulld r12, r8, r4 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: extsw r11, r11 +; P8LE-NEXT: mulld r30, r10, r4 +; P8LE-NEXT: mulld r0, r9, r4 +; P8LE-NEXT: mulld r4, r11, r4 +; P8LE-NEXT: rldicl r12, r12, 32, 32 +; P8LE-NEXT: add r8, r12, r8 +; P8LE-NEXT: rldicl r12, r30, 32, 32 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: rldicl r0, r0, 32, 32 +; P8LE-NEXT: rldicl r4, r4, 32, 32 +; P8LE-NEXT: add r10, r12, r10 +; P8LE-NEXT: add r9, r0, r9 +; P8LE-NEXT: srwi r0, r8, 31 +; P8LE-NEXT: add r4, r4, r11 +; P8LE-NEXT: srwi r11, r10, 31 +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: srawi r10, r10, 6 +; P8LE-NEXT: srwi r12, r9, 31 +; P8LE-NEXT: add r8, r8, r0 +; P8LE-NEXT: srawi r9, r9, 6 +; P8LE-NEXT: add r10, r10, r11 +; P8LE-NEXT: srwi r11, r4, 31 +; P8LE-NEXT: srawi r4, r4, 6 +; P8LE-NEXT: add r9, r9, r12 +; P8LE-NEXT: mulli r8, r8, 95 +; P8LE-NEXT: add r4, r4, r11 +; P8LE-NEXT: mulli r9, r9, 95 +; P8LE-NEXT: mulli r10, r10, 95 +; P8LE-NEXT: mulli r4, r4, 95 +; P8LE-NEXT: subf r3, r8, r3 +; P8LE-NEXT: subf r6, r9, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r10, r7 +; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: fold_srem_vec_2: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, -21386 +; P8BE-NEXT: ori r3, r3, 37253 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: mulld r8, r5, r3 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: extsw r7, r7 +; P8BE-NEXT: mulld r9, r6, r3 +; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: mulld r10, r7, r3 +; P8BE-NEXT: mulld r3, r4, r3 +; P8BE-NEXT: rldicl r8, r8, 32, 32 +; P8BE-NEXT: rldicl r9, r9, 32, 32 +; P8BE-NEXT: add r8, r8, r5 +; P8BE-NEXT: rldicl r10, r10, 32, 32 +; P8BE-NEXT: add r9, r9, r6 +; P8BE-NEXT: srwi r11, r8, 31 +; P8BE-NEXT: srawi r8, r8, 6 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: add r10, r10, r7 +; P8BE-NEXT: add r8, r8, r11 +; P8BE-NEXT: srwi r11, r9, 31 +; P8BE-NEXT: add r3, r3, r4 +; P8BE-NEXT: srawi r9, r9, 6 +; P8BE-NEXT: mulli r8, r8, 95 +; P8BE-NEXT: add r9, r9, r11 +; P8BE-NEXT: srwi r11, r10, 31 +; P8BE-NEXT: srawi r10, r10, 6 +; P8BE-NEXT: mulli r9, r9, 95 +; P8BE-NEXT: add r10, r10, r11 +; P8BE-NEXT: srwi r11, r3, 31 +; P8BE-NEXT: srawi r3, r3, 6 +; P8BE-NEXT: mulli r10, r10, 95 +; P8BE-NEXT: subf r5, r8, r5 +; P8BE-NEXT: add r3, r3, r11 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: subf r6, r9, r6 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: subf r7, r10, r7 +; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: subf r3, r3, r4 +; P8BE-NEXT: sldi r4, r7, 48 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: 
vmrghw v2, v3, v2 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. +define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; P9LE-LABEL: combine_srem_sdiv: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r6, r4, r5 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: add r4, r6, r4 +; P9LE-NEXT: srwi r6, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r6 +; P9LE-NEXT: mulli r6, r4, 95 +; P9LE-NEXT: subf r3, r6, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r6, r3 +; P9LE-NEXT: extsw r6, r6 +; P9LE-NEXT: mulld r7, r6, r5 +; P9LE-NEXT: rldicl r7, r7, 32, 32 +; P9LE-NEXT: add r6, r7, r6 +; P9LE-NEXT: srwi r7, r6, 31 +; P9LE-NEXT: srawi r6, r6, 6 +; P9LE-NEXT: add r6, r6, r7 +; P9LE-NEXT: mulli r7, r6, 95 +; P9LE-NEXT: subf r3, r7, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r7, r3 +; P9LE-NEXT: extsw r7, r7 +; P9LE-NEXT: mulld r8, r7, r5 +; P9LE-NEXT: rldicl r8, r8, 32, 32 +; P9LE-NEXT: add r7, r8, r7 +; P9LE-NEXT: srwi r8, r7, 31 +; P9LE-NEXT: srawi r7, r7, 6 +; P9LE-NEXT: add r7, r7, r8 +; P9LE-NEXT: mulli r8, r7, 95 +; P9LE-NEXT: subf r3, r8, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r8, r3 +; P9LE-NEXT: extsw r8, r8 +; P9LE-NEXT: mulld r5, r8, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r5, r5, r8 +; P9LE-NEXT: srwi r8, r5, 31 +; P9LE-NEXT: srawi r5, r5, 6 +; P9LE-NEXT: add r5, r5, r8 +; P9LE-NEXT: mulli r8, r5, 95 +; P9LE-NEXT: subf r3, r8, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: mtvsrd f0, r4 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r6 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r7 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r5 +; P9LE-NEXT: xxswapd v5, vs0 +; P9LE-NEXT: vmrglh v4, v5, v4 +; P9LE-NEXT: vmrglw v3, v4, v3 +; P9LE-NEXT: vadduhm v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: combine_srem_sdiv: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r4, r3 +; P9BE-NEXT: lis r5, -21386 +; P9BE-NEXT: ori r5, r5, 37253 +; P9BE-NEXT: extsw r4, r4 +; P9BE-NEXT: mulld r6, r4, r5 +; P9BE-NEXT: rldicl r6, r6, 32, 32 +; P9BE-NEXT: add r4, r6, r4 +; P9BE-NEXT: srwi r6, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r6 +; P9BE-NEXT: mulli r6, r4, 95 +; P9BE-NEXT: subf r3, r6, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r6, r3 +; P9BE-NEXT: extsw r6, r6 +; P9BE-NEXT: mulld r7, r6, r5 +; P9BE-NEXT: rldicl r7, r7, 32, 32 +; P9BE-NEXT: add r6, r7, r6 +; P9BE-NEXT: srwi r7, r6, 31 +; P9BE-NEXT: srawi r6, r6, 6 +; P9BE-NEXT: add r6, r6, r7 +; P9BE-NEXT: mulli r7, r6, 95 +; P9BE-NEXT: subf r3, r7, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r7, r3 +; P9BE-NEXT: extsw r7, r7 +; P9BE-NEXT: mulld r8, r7, r5 +; P9BE-NEXT: rldicl r8, r8, 
32, 32 +; P9BE-NEXT: add r7, r8, r7 +; P9BE-NEXT: srwi r8, r7, 31 +; P9BE-NEXT: srawi r7, r7, 6 +; P9BE-NEXT: add r7, r7, r8 +; P9BE-NEXT: mulli r8, r7, 95 +; P9BE-NEXT: subf r3, r8, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r5, r3, r5 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r8, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r8 +; P9BE-NEXT: mulli r8, r5, 95 +; P9BE-NEXT: subf r3, r8, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: sldi r3, r4, 48 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: sldi r3, r6, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: sldi r3, r7, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: sldi r3, r5, 48 +; P9BE-NEXT: mtvsrd v5, r3 +; P9BE-NEXT: vmrghh v4, v5, v4 +; P9BE-NEXT: vmrghw v3, v4, v3 +; P9BE-NEXT: vadduhm v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: combine_srem_sdiv: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r5, -21386 +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r5, r5, 37253 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: clrldi r3, r6, 48 +; P8LE-NEXT: rldicl r4, r6, 48, 48 +; P8LE-NEXT: rldicl r7, r6, 32, 48 +; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: extsh r9, r4 +; P8LE-NEXT: rldicl r6, r6, 16, 48 +; P8LE-NEXT: extsh r10, r7 +; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: extsh r11, r6 +; P8LE-NEXT: extsw r10, r10 +; P8LE-NEXT: mulld r12, r8, r5 +; P8LE-NEXT: extsw r11, r11 +; P8LE-NEXT: mulld r0, r9, r5 +; P8LE-NEXT: mulld r30, r10, r5 +; P8LE-NEXT: mulld r5, r11, r5 +; P8LE-NEXT: rldicl r12, r12, 32, 32 +; P8LE-NEXT: rldicl r0, r0, 32, 32 +; P8LE-NEXT: rldicl r30, r30, 32, 32 +; P8LE-NEXT: add r8, r12, r8 +; P8LE-NEXT: rldicl r5, r5, 32, 32 +; P8LE-NEXT: add r9, r0, r9 +; P8LE-NEXT: add r10, r30, r10 +; P8LE-NEXT: srwi r12, r8, 31 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: srawi r0, r9, 6 +; P8LE-NEXT: srwi r9, r9, 31 +; P8LE-NEXT: add r5, r5, r11 +; P8LE-NEXT: add r8, r8, r12 +; P8LE-NEXT: srawi r12, r10, 6 +; P8LE-NEXT: srwi r10, r10, 31 +; P8LE-NEXT: add r9, r0, r9 +; P8LE-NEXT: mulli r0, r8, 95 +; P8LE-NEXT: add r10, r12, r10 +; P8LE-NEXT: mtvsrd f0, r8 +; P8LE-NEXT: srwi r8, r5, 31 +; P8LE-NEXT: srawi r5, r5, 6 +; P8LE-NEXT: mulli r11, r9, 95 +; P8LE-NEXT: mtvsrd f1, r9 +; P8LE-NEXT: mulli r9, r10, 95 +; P8LE-NEXT: add r5, r5, r8 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f2, r10 +; P8LE-NEXT: mtvsrd f3, r5 +; P8LE-NEXT: mulli r5, r5, 95 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: subf r3, r0, r3 +; P8LE-NEXT: xxswapd v1, vs2 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r4, r11, r4 +; P8LE-NEXT: xxswapd v6, vs3 +; P8LE-NEXT: subf r3, r9, r7 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f4, r3 +; P8LE-NEXT: subf r3, r5, r6 +; P8LE-NEXT: mtvsrd f5, r3 +; P8LE-NEXT: xxswapd v4, vs1 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: xxswapd v3, vs0 +; P8LE-NEXT: xxswapd v5, vs4 +; P8LE-NEXT: xxswapd v0, vs5 +; P8LE-NEXT: vmrglh v3, v4, v3 +; P8LE-NEXT: vmrglh v4, v0, v5 +; P8LE-NEXT: vmrglh v5, v6, v1 +; P8LE-NEXT: vmrglw v3, v4, v3 +; P8LE-NEXT: vmrglw v2, v5, v2 +; P8LE-NEXT: vadduhm v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: combine_srem_sdiv: +; P8BE: # %bb.0: +; 
P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r5, -21386 +; P8BE-NEXT: ori r5, r5, 37253 +; P8BE-NEXT: clrldi r3, r6, 48 +; P8BE-NEXT: rldicl r4, r6, 48, 48 +; P8BE-NEXT: extsh r8, r3 +; P8BE-NEXT: rldicl r7, r6, 32, 48 +; P8BE-NEXT: extsh r9, r4 +; P8BE-NEXT: rldicl r6, r6, 16, 48 +; P8BE-NEXT: extsw r8, r8 +; P8BE-NEXT: extsh r10, r7 +; P8BE-NEXT: extsw r9, r9 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: mulld r11, r8, r5 +; P8BE-NEXT: extsw r10, r10 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: mulld r12, r9, r5 +; P8BE-NEXT: mulld r0, r10, r5 +; P8BE-NEXT: mulld r5, r6, r5 +; P8BE-NEXT: rldicl r11, r11, 32, 32 +; P8BE-NEXT: rldicl r12, r12, 32, 32 +; P8BE-NEXT: add r8, r11, r8 +; P8BE-NEXT: rldicl r0, r0, 32, 32 +; P8BE-NEXT: rldicl r5, r5, 32, 32 +; P8BE-NEXT: add r9, r12, r9 +; P8BE-NEXT: srawi r11, r8, 6 +; P8BE-NEXT: srwi r8, r8, 31 +; P8BE-NEXT: add r10, r0, r10 +; P8BE-NEXT: add r5, r5, r6 +; P8BE-NEXT: srawi r12, r9, 6 +; P8BE-NEXT: srwi r9, r9, 31 +; P8BE-NEXT: add r8, r11, r8 +; P8BE-NEXT: srawi r0, r10, 6 +; P8BE-NEXT: srawi r11, r5, 6 +; P8BE-NEXT: srwi r10, r10, 31 +; P8BE-NEXT: add r9, r12, r9 +; P8BE-NEXT: srwi r5, r5, 31 +; P8BE-NEXT: mulli r12, r8, 95 +; P8BE-NEXT: add r10, r0, r10 +; P8BE-NEXT: add r5, r11, r5 +; P8BE-NEXT: mulli r0, r9, 95 +; P8BE-NEXT: sldi r9, r9, 48 +; P8BE-NEXT: sldi r8, r8, 48 +; P8BE-NEXT: mtvsrd v3, r9 +; P8BE-NEXT: mulli r9, r5, 95 +; P8BE-NEXT: mtvsrd v2, r8 +; P8BE-NEXT: mulli r8, r10, 95 +; P8BE-NEXT: sldi r10, r10, 48 +; P8BE-NEXT: subf r3, r12, r3 +; P8BE-NEXT: mtvsrd v4, r10 +; P8BE-NEXT: subf r4, r0, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: subf r3, r9, r6 +; P8BE-NEXT: subf r7, r8, r7 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: mtvsrd v1, r3 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: mtvsrd v0, r6 +; P8BE-NEXT: vmrghh v3, v5, v3 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v0, v1, v0 +; P8BE-NEXT: vmrghh v4, v5, v4 +; P8BE-NEXT: vmrghw v3, v0, v3 +; P8BE-NEXT: vmrghw v2, v4, v2 +; P8BE-NEXT: vadduhm v2, v3, v2 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
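+; Roughly what the checks below exercise: srem by a power of two lowers to a
+; shift sequence instead of a magic multiply. srawi k records in CA whether a
+; negative value lost any nonzero bits, addze corrects the quotient toward
+; zero, and slwi k plus subf rebuild the remainder. For example, -100 srem 64:
+; srawi 6 gives -2 with CA = 1, addze gives -1, and -100 - (-1 * 64) = -36.
+; The shift amounts in the checks (6, 5, 3) correspond to divisors 64, 32 and
+; 8; the remaining lane (mulli ..., 95) still takes the magic-multiply path.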
+define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_srem_power_of_two: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: addze r4, r4 +; P9LE-NEXT: slwi r4, r4, 6 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: srawi r4, r4, 5 +; P9LE-NEXT: addze r4, r4 +; P9LE-NEXT: slwi r4, r4, 5 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -21386 +; P9LE-NEXT: ori r5, r5, 37253 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 6 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: srawi r4, r4, 3 +; P9LE-NEXT: addze r4, r4 +; P9LE-NEXT: slwi r4, r4, 3 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_srem_power_of_two: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 5 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 5 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 6 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 6 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, -21386 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 3 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 3 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_srem_power_of_two: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r3, -21386 +; P8LE-NEXT: ori r3, r3, 37253 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r5, r4, 16, 48 +; P8LE-NEXT: clrldi r7, r4, 48 +; P8LE-NEXT: extsh r6, r5 +; P8LE-NEXT: extsh r8, r7 +; P8LE-NEXT: extsw r6, r6 +; P8LE-NEXT: rldicl r9, r4, 48, 48 +; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: extsh r10, r9 +; P8LE-NEXT: addze r8, r8 +; P8LE-NEXT: rldicl r4, r4, 32, 48 +; P8LE-NEXT: srawi r10, r10, 5 +; P8LE-NEXT: slwi r8, r8, 6 +; P8LE-NEXT: subf r7, r8, r7 +; P8LE-NEXT: rldicl r3, 
r3, 32, 32 +; P8LE-NEXT: mtvsrd f0, r7 +; P8LE-NEXT: add r3, r3, r6 +; P8LE-NEXT: addze r6, r10 +; P8LE-NEXT: srwi r10, r3, 31 +; P8LE-NEXT: srawi r3, r3, 6 +; P8LE-NEXT: slwi r6, r6, 5 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: add r3, r3, r10 +; P8LE-NEXT: extsh r10, r4 +; P8LE-NEXT: subf r6, r6, r9 +; P8LE-NEXT: mulli r3, r3, 95 +; P8LE-NEXT: srawi r8, r10, 3 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: addze r7, r8 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: slwi r5, r7, 3 +; P8LE-NEXT: subf r4, r5, r4 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_srem_power_of_two: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, -21386 +; P8BE-NEXT: ori r3, r3, 37253 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 32, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: rldicl r7, r4, 16, 48 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: srawi r8, r6, 5 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: addze r8, r8 +; P8BE-NEXT: rldicl r4, r4, 48, 48 +; P8BE-NEXT: srawi r9, r7, 6 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: slwi r8, r8, 5 +; P8BE-NEXT: addze r9, r9 +; P8BE-NEXT: subf r6, r8, r6 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: slwi r8, r9, 6 +; P8BE-NEXT: add r3, r3, r5 +; P8BE-NEXT: subf r7, r8, r7 +; P8BE-NEXT: srwi r10, r3, 31 +; P8BE-NEXT: srawi r3, r3, 6 +; P8BE-NEXT: add r3, r3, r10 +; P8BE-NEXT: srawi r9, r4, 3 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: addze r8, r9 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: slwi r6, r8, 3 +; P8BE-NEXT: subf r4, r6, r4 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: sldi r5, r7, 48 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v3, r5 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
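+; srem by 1 is 0 for every input, so that lane is never computed: the checks
+; below materialize it as a constant zero lane (xxlxor / li 0) while the
+; other lanes go through the usual magic-multiply path (mulli by 654, 23 and
+; 5423 below).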
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_srem_one: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -14230 +; P9LE-NEXT: ori r5, r5, 30865 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 9 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, -19946 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 17097 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 4 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, 24749 +; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 47143 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: rldicl r4, r4, 32, 32 +; P9LE-NEXT: srawi r4, r4, 11 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: vmrglh v3, v3, v4 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_srem_one: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 4 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 47143 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: srawi r4, r4, 11 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, -14230 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 30865 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 9 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v3, v4 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: vmrghh v2, v4, v2 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_srem_one: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r3, 24749 +; P8LE-NEXT: lis r8, -19946 +; P8LE-NEXT: lis r10, -14230 +; P8LE-NEXT: xxlxor v5, v5, v5 +; P8LE-NEXT: ori r3, r3, 47143 
+; P8LE-NEXT: ori r8, r8, 17097 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r5, r4, 16, 48 +; P8LE-NEXT: rldicl r6, r4, 32, 48 +; P8LE-NEXT: rldicl r4, r4, 48, 48 +; P8LE-NEXT: extsh r7, r5 +; P8LE-NEXT: extsh r9, r6 +; P8LE-NEXT: extsw r7, r7 +; P8LE-NEXT: extsh r11, r4 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: mulld r3, r7, r3 +; P8LE-NEXT: ori r7, r10, 30865 +; P8LE-NEXT: extsw r10, r11 +; P8LE-NEXT: mulld r8, r9, r8 +; P8LE-NEXT: mulld r7, r10, r7 +; P8LE-NEXT: rldicl r11, r3, 1, 63 +; P8LE-NEXT: rldicl r3, r3, 32, 32 +; P8LE-NEXT: rldicl r8, r8, 32, 32 +; P8LE-NEXT: rldicl r7, r7, 32, 32 +; P8LE-NEXT: add r8, r8, r9 +; P8LE-NEXT: srawi r3, r3, 11 +; P8LE-NEXT: add r7, r7, r10 +; P8LE-NEXT: srwi r9, r8, 31 +; P8LE-NEXT: srawi r8, r8, 4 +; P8LE-NEXT: add r3, r3, r11 +; P8LE-NEXT: add r8, r8, r9 +; P8LE-NEXT: srwi r9, r7, 31 +; P8LE-NEXT: srawi r7, r7, 9 +; P8LE-NEXT: mulli r3, r3, 5423 +; P8LE-NEXT: add r7, r7, r9 +; P8LE-NEXT: mulli r8, r8, 23 +; P8LE-NEXT: mulli r7, r7, 654 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r8, r6 +; P8LE-NEXT: subf r4, r7, r4 +; P8LE-NEXT: mtvsrd f1, r3 +; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v2, v2, v3 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_srem_one: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 24749 +; P8BE-NEXT: lis r7, -19946 +; P8BE-NEXT: lis r8, -14230 +; P8BE-NEXT: ori r3, r3, 47143 +; P8BE-NEXT: ori r7, r7, 17097 +; P8BE-NEXT: ori r8, r8, 30865 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: extsw r4, r4 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: mulld r7, r6, r7 +; P8BE-NEXT: mulld r8, r4, r8 +; P8BE-NEXT: rldicl r9, r3, 1, 63 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: rldicl r7, r7, 32, 32 +; P8BE-NEXT: rldicl r8, r8, 32, 32 +; P8BE-NEXT: srawi r3, r3, 11 +; P8BE-NEXT: add r7, r7, r6 +; P8BE-NEXT: add r8, r8, r4 +; P8BE-NEXT: add r3, r3, r9 +; P8BE-NEXT: srwi r9, r7, 31 +; P8BE-NEXT: srawi r7, r7, 4 +; P8BE-NEXT: mulli r3, r3, 5423 +; P8BE-NEXT: add r7, r7, r9 +; P8BE-NEXT: srwi r9, r8, 31 +; P8BE-NEXT: srawi r8, r8, 9 +; P8BE-NEXT: mulli r7, r7, 23 +; P8BE-NEXT: add r8, r8, r9 +; P8BE-NEXT: li r9, 0 +; P8BE-NEXT: mulli r8, r8, 654 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: sldi r5, r9, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: subf r5, r7, r6 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: subf r4, r8, r4 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: vmrghh v3, v4, v3 +; P8BE-NEXT: vmrghh v2, v2, v5 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
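+; 32768 is 2^15, the sign-bit value of i16, so this lane is not folded into a
+; magic multiply; the checks below handle it with the power-of-two shift
+; sequence (srawi 15 / addze / slwi 15), keep the magic multiplies for the 23
+; and 5423 lanes, and fold the srem-by-1 lane to a constant zero.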
+define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_urem_i16_smax: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: lis r5, -19946 +; P9LE-NEXT: ori r5, r5, 17097 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: mulld r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: srwi r5, r4, 31 +; P9LE-NEXT: srawi r4, r4, 4 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, 24749 +; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: extsw r4, r4 +; P9LE-NEXT: ori r5, r5, 47143 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: rldicl r4, r4, 32, 32 +; P9LE-NEXT: srawi r4, r4, 11 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: extsh r4, r3 +; P9LE-NEXT: srawi r4, r4, 15 +; P9LE-NEXT: addze r4, r4 +; P9LE-NEXT: slwi r4, r4, 15 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_urem_i16_smax: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r5, r4, 31 +; P9BE-NEXT: srawi r4, r4, 4 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: extsw r3, r3 +; P9BE-NEXT: ori r4, r4, 47143 +; P9BE-NEXT: mulld r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: srawi r4, r4, 11 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: srawi r4, r3, 15 +; P9BE-NEXT: addze r4, r4 +; P9BE-NEXT: slwi r4, r4, 15 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v3, v4 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: vmrghh v2, v4, v2 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_urem_i16_smax: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r6, 24749 +; P8LE-NEXT: lis r7, -19946 +; P8LE-NEXT: xxlxor v5, v5, v5 +; P8LE-NEXT: ori r6, r6, 47143 +; P8LE-NEXT: ori r7, r7, 17097 +; P8LE-NEXT: mfvsrd r3, f0 +; P8LE-NEXT: rldicl r4, r3, 16, 48 +; P8LE-NEXT: rldicl r5, r3, 32, 48 +; P8LE-NEXT: extsh r8, r4 +; P8LE-NEXT: extsh r9, r5 +; P8LE-NEXT: extsw r8, r8 +; P8LE-NEXT: extsw r9, r9 +; P8LE-NEXT: mulld r6, r8, r6 +; P8LE-NEXT: mulld r7, r9, r7 +; P8LE-NEXT: rldicl r3, r3, 48, 48 +; P8LE-NEXT: rldicl r8, r6, 32, 32 +; P8LE-NEXT: rldicl r7, r7, 32, 32 +; P8LE-NEXT: rldicl r6, r6, 1, 
63 +; P8LE-NEXT: srawi r8, r8, 11 +; P8LE-NEXT: add r7, r7, r9 +; P8LE-NEXT: add r6, r8, r6 +; P8LE-NEXT: srwi r8, r7, 31 +; P8LE-NEXT: srawi r7, r7, 4 +; P8LE-NEXT: mulli r6, r6, 5423 +; P8LE-NEXT: add r7, r7, r8 +; P8LE-NEXT: extsh r8, r3 +; P8LE-NEXT: mulli r7, r7, 23 +; P8LE-NEXT: srawi r8, r8, 15 +; P8LE-NEXT: subf r4, r6, r4 +; P8LE-NEXT: addze r6, r8 +; P8LE-NEXT: mtvsrd f0, r4 +; P8LE-NEXT: slwi r4, r6, 15 +; P8LE-NEXT: subf r5, r7, r5 +; P8LE-NEXT: subf r3, r4, r3 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v2, v2, v3 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_urem_i16_smax: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 24749 +; P8BE-NEXT: lis r7, -19946 +; P8BE-NEXT: ori r3, r3, 47143 +; P8BE-NEXT: ori r7, r7, 17097 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: extsh r5, r5 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsw r5, r5 +; P8BE-NEXT: extsw r6, r6 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: mulld r7, r6, r7 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: rldicl r8, r3, 1, 63 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: rldicl r7, r7, 32, 32 +; P8BE-NEXT: srawi r3, r3, 11 +; P8BE-NEXT: add r7, r7, r6 +; P8BE-NEXT: add r3, r3, r8 +; P8BE-NEXT: srwi r8, r7, 31 +; P8BE-NEXT: srawi r7, r7, 4 +; P8BE-NEXT: mulli r3, r3, 5423 +; P8BE-NEXT: add r7, r7, r8 +; P8BE-NEXT: li r8, 0 +; P8BE-NEXT: mulli r7, r7, 23 +; P8BE-NEXT: srawi r9, r4, 15 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: sldi r5, r8, 48 +; P8BE-NEXT: addze r8, r9 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: subf r5, r7, r6 +; P8BE-NEXT: slwi r6, r8, 15 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: subf r4, r6, r4 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: vmrghh v3, v4, v3 +; P8BE-NEXT: vmrghh v2, v2, v5 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. 
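+; For i64 elements each magic constant is a full 64-bit value, assembled with
+; lis/ori/sldi/oris/ori; mulhd takes the high 64 bits of the 128-bit product,
+; sradi plus the sign bit (rldicl ..., 1, 63) form the quotient, and mulli/sub
+; rebuild the remainder. The lane with divisor 1 again becomes a zero (li 0).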
+define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; P9LE-LABEL: dont_fold_srem_i64: +; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 24749 +; P9LE-NEXT: ori r4, r4, 47142 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 58853 +; P9LE-NEXT: mfvsrd r3, v3 +; P9LE-NEXT: ori r4, r4, 6055 +; P9LE-NEXT: mulhd r4, r3, r4 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: sradi r4, r4, 11 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: lis r5, -19946 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: ori r5, r5, 17096 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 22795 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: mfvsrld r4, v3 +; P9LE-NEXT: ori r5, r5, 8549 +; P9LE-NEXT: mulhd r5, r4, r5 +; P9LE-NEXT: add r5, r5, r4 +; P9LE-NEXT: rldicl r6, r5, 1, 63 +; P9LE-NEXT: sradi r5, r5, 4 +; P9LE-NEXT: add r5, r5, r6 +; P9LE-NEXT: mulli r5, r5, 23 +; P9LE-NEXT: sub r4, r4, r5 +; P9LE-NEXT: mtvsrdd v3, r3, r4 +; P9LE-NEXT: lis r4, 25653 +; P9LE-NEXT: ori r4, r4, 15432 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 1603 +; P9LE-NEXT: mfvsrd r3, v2 +; P9LE-NEXT: ori r4, r4, 21445 +; P9LE-NEXT: mulhd r4, r3, r4 +; P9LE-NEXT: rldicl r5, r4, 1, 63 +; P9LE-NEXT: sradi r4, r4, 8 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: li r4, 0 +; P9LE-NEXT: mtvsrdd v2, r3, r4 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_srem_i64: +; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 24749 +; P9BE-NEXT: ori r4, r4, 47142 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 58853 +; P9BE-NEXT: mfvsrld r3, v3 +; P9BE-NEXT: ori r4, r4, 6055 +; P9BE-NEXT: mulhd r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: sradi r4, r4, 11 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: lis r5, -19946 +; P9BE-NEXT: ori r5, r5, 17096 +; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 22795 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: mfvsrd r4, v3 +; P9BE-NEXT: ori r5, r5, 8549 +; P9BE-NEXT: mulhd r5, r4, r5 +; P9BE-NEXT: add r5, r5, r4 +; P9BE-NEXT: rldicl r6, r5, 1, 63 +; P9BE-NEXT: sradi r5, r5, 4 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r5, r5, 23 +; P9BE-NEXT: sub r4, r4, r5 +; P9BE-NEXT: mtvsrdd v3, r4, r3 +; P9BE-NEXT: lis r4, 25653 +; P9BE-NEXT: ori r4, r4, 15432 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 1603 +; P9BE-NEXT: mfvsrld r3, v2 +; P9BE-NEXT: ori r4, r4, 21445 +; P9BE-NEXT: mulhd r4, r3, r4 +; P9BE-NEXT: rldicl r5, r4, 1, 63 +; P9BE-NEXT: sradi r4, r4, 8 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: mtvsrdd v2, 0, r3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_srem_i64: +; P8LE: # %bb.0: +; P8LE-NEXT: lis r3, 24749 +; P8LE-NEXT: lis r4, -19946 +; P8LE-NEXT: lis r5, 25653 +; P8LE-NEXT: xxswapd vs0, v3 +; P8LE-NEXT: mfvsrd r6, v3 +; P8LE-NEXT: ori r3, r3, 47142 +; P8LE-NEXT: ori r4, r4, 17096 +; P8LE-NEXT: ori r5, r5, 15432 +; P8LE-NEXT: mfvsrd r7, v2 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: oris r3, r3, 58853 +; P8LE-NEXT: oris r4, r4, 22795 +; P8LE-NEXT: mfvsrd r8, f0 +; P8LE-NEXT: oris r5, r5, 1603 +; P8LE-NEXT: ori r3, r3, 6055 +; P8LE-NEXT: ori r4, r4, 8549 +; P8LE-NEXT: ori r5, r5, 21445 +; P8LE-NEXT: mulhd r3, r6, r3 +; P8LE-NEXT: mulhd r5, r7, r5 +; P8LE-NEXT: mulhd r4, r8, r4 +; P8LE-NEXT: rldicl r9, r3, 1, 63 +; P8LE-NEXT: sradi r3, r3, 11 +; P8LE-NEXT: add r3, r3, r9 +; P8LE-NEXT: rldicl r9, r5, 1, 63 +; P8LE-NEXT: add r4, r4, r8 +; P8LE-NEXT: sradi r5, r5, 8 +; P8LE-NEXT: 
mulli r3, r3, 5423 +; P8LE-NEXT: add r5, r5, r9 +; P8LE-NEXT: rldicl r9, r4, 1, 63 +; P8LE-NEXT: sradi r4, r4, 4 +; P8LE-NEXT: mulli r5, r5, 654 +; P8LE-NEXT: add r4, r4, r9 +; P8LE-NEXT: mulli r4, r4, 23 +; P8LE-NEXT: sub r3, r6, r3 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: sub r5, r7, r5 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: sub r3, r8, r4 +; P8LE-NEXT: li r4, 0 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxmrghd v3, vs0, vs2 +; P8LE-NEXT: xxmrghd v2, vs1, vs3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_srem_i64: +; P8BE: # %bb.0: +; P8BE-NEXT: lis r4, -19946 +; P8BE-NEXT: lis r3, 24749 +; P8BE-NEXT: xxswapd vs0, v3 +; P8BE-NEXT: lis r5, 25653 +; P8BE-NEXT: xxswapd vs1, v2 +; P8BE-NEXT: ori r4, r4, 17096 +; P8BE-NEXT: ori r3, r3, 47142 +; P8BE-NEXT: ori r5, r5, 15432 +; P8BE-NEXT: mfvsrd r6, v3 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: oris r4, r4, 22795 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: oris r3, r3, 58853 +; P8BE-NEXT: mfvsrd r7, f0 +; P8BE-NEXT: ori r4, r4, 8549 +; P8BE-NEXT: ori r3, r3, 6055 +; P8BE-NEXT: oris r5, r5, 1603 +; P8BE-NEXT: mfvsrd r8, f1 +; P8BE-NEXT: mulhd r4, r6, r4 +; P8BE-NEXT: mulhd r3, r7, r3 +; P8BE-NEXT: ori r5, r5, 21445 +; P8BE-NEXT: mulhd r5, r8, r5 +; P8BE-NEXT: add r4, r4, r6 +; P8BE-NEXT: rldicl r9, r3, 1, 63 +; P8BE-NEXT: sradi r3, r3, 11 +; P8BE-NEXT: rldicl r10, r4, 1, 63 +; P8BE-NEXT: sradi r4, r4, 4 +; P8BE-NEXT: add r3, r3, r9 +; P8BE-NEXT: rldicl r9, r5, 1, 63 +; P8BE-NEXT: add r4, r4, r10 +; P8BE-NEXT: sradi r5, r5, 8 +; P8BE-NEXT: mulli r3, r3, 5423 +; P8BE-NEXT: add r5, r5, r9 +; P8BE-NEXT: mulli r4, r4, 23 +; P8BE-NEXT: mulli r5, r5, 654 +; P8BE-NEXT: sub r3, r7, r3 +; P8BE-NEXT: sub r4, r6, r4 +; P8BE-NEXT: mtvsrd f0, r3 +; P8BE-NEXT: sub r3, r8, r5 +; P8BE-NEXT: mtvsrd f1, r4 +; P8BE-NEXT: li r4, 0 +; P8BE-NEXT: mtvsrd f2, r3 +; P8BE-NEXT: mtvsrd f3, r4 +; P8BE-NEXT: xxmrghd v3, vs1, vs0 +; P8BE-NEXT: xxmrghd v2, vs3, vs2 +; P8BE-NEXT: blr + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll new file mode 100644 index 00000000000000..f361200d54fda7 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc64 < %s | FileCheck -check-prefixes=CHECK,CHECK64 %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc-unknown-linux-gnu -mcpu=ppc < %s | FileCheck -check-prefixes=CHECK,CHECK32 %s + +define i32 @fold_urem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_urem_positive_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, 22765 +; CHECK-NEXT: ori 4, 4, 8969 +; CHECK-NEXT: mulhwu 4, 3, 4 +; CHECK-NEXT: subf 5, 4, 3 +; CHECK-NEXT: srwi 5, 5, 1 +; CHECK-NEXT: add 4, 5, 4 +; CHECK-NEXT: srwi 4, 4, 6 +; CHECK-NEXT: mulli 4, 4, 95 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positive_even(i32 %x) { +; CHECK-LABEL: fold_urem_positive_even: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, -2226 +; CHECK-NEXT: ori 4, 4, 16323 +; CHECK-NEXT: mulhwu 4, 3, 4 +; CHECK-NEXT: srwi 4, 4, 10 +; CHECK-NEXT: mulli 4, 4, 1060 +; CHECK-NEXT: subf 3, 4, 3 +; CHECK-NEXT: blr + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. 
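+; Here the quotient from the mulhwu-based magic sequence is computed once and
+; reused: the remainder is formed as x - q * 95 and the final add folds the
+; quotient back in, so only a single division sequence appears in the checks
+; below. In C terms, roughly: q = x / 95; return (x - q * 95) + q;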
+define i32 @combine_urem_udiv(i32 %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: # %bb.0: +; CHECK-NEXT: lis 4, 22765 +; CHECK-NEXT: ori 4, 4, 8969 +; CHECK-NEXT: mulhwu 4, 3, 4 +; CHECK-NEXT: subf 5, 4, 3 +; CHECK-NEXT: srwi 5, 5, 1 +; CHECK-NEXT: add 4, 5, 4 +; CHECK-NEXT: srwi 4, 4, 6 +; CHECK-NEXT: mulli 5, 4, 95 +; CHECK-NEXT: subf 3, 5, 3 +; CHECK-NEXT: add 3, 3, 4 +; CHECK-NEXT: blr + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: clrlwi 3, 3, 26 +; CHECK-NEXT: blr + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: blr + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: # %bb.0: +; CHECK-NEXT: blr + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: stw 0, 4(1) +; CHECK-NEXT: stwu 1, -16(1) +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset lr, 4 +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: li 6, 98 +; CHECK-NEXT: bl __umoddi3@PLT +; CHECK-NEXT: lwz 0, 20(1) +; CHECK-NEXT: addi 1, 1, 16 +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr + %1 = urem i64 %x, 98 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll new file mode 100644 index 00000000000000..e3d9027d9e98c6 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -0,0 +1,1338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P9LE +; RUN: llc -mcpu=pwr9 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P9BE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8LE +; RUN: llc -mcpu=pwr8 -verify-machineinstrs -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \ +; RUN: -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,P8BE + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; P9LE-LABEL: fold_urem_vec_1: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: lis r5, 21399 +; P9LE-NEXT: ori r5, r5, 33437 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: lis r5, 16727 +; P9LE-NEXT: ori r5, r5, 2287 +; P9LE-NEXT: rldicl r4, r4, 27, 37 +; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: lis r5, 8456 +; P9LE-NEXT: ori r5, r5, 16913 +; P9LE-NEXT: rldicl r4, r4, 24, 40 +; P9LE-NEXT: mulli r4, r4, 1003 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: 
li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 30, 18, 31 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r4, r4, 30, 34 +; P9LE-NEXT: mulli r4, r4, 124 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: blr +; +; P9BE-LABEL: fold_urem_vec_1: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: lis r5, 16727 +; P9BE-NEXT: ori r5, r5, 2287 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: lis r5, 21399 +; P9BE-NEXT: ori r5, r5, 33437 +; P9BE-NEXT: rldicl r4, r4, 24, 40 +; P9BE-NEXT: mulli r4, r4, 1003 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: lis r5, 8456 +; P9BE-NEXT: ori r5, r5, 16913 +; P9BE-NEXT: rldicl r4, r4, 27, 37 +; P9BE-NEXT: mulli r4, r4, 98 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: clrlwi r4, r3, 16 +; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: rldicl r3, r3, 30, 34 +; P9BE-NEXT: mulli r3, r3, 124 +; P9BE-NEXT: subf r3, r3, r4 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r5, r4, r3 +; P9BE-NEXT: srwi r5, r5, 1 +; P9BE-NEXT: add r4, r5, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: fold_urem_vec_1: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r3, 22765 +; P8LE-NEXT: lis r8, 21399 +; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: ori r8, r8, 33437 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rldicl r9, r4, 32, 48 +; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 +; P8LE-NEXT: rldicl r10, r4, 16, 48 +; P8LE-NEXT: rlwinm r11, r9, 0, 16, 31 +; P8LE-NEXT: clrldi r7, r6, 32 +; P8LE-NEXT: rlwinm r12, r10, 0, 16, 31 +; P8LE-NEXT: mulld r3, r7, r3 +; P8LE-NEXT: lis r7, 16727 +; P8LE-NEXT: ori r7, r7, 2287 +; P8LE-NEXT: mulld r8, r11, r8 +; P8LE-NEXT: lis r11, 8456 +; P8LE-NEXT: rldicl r4, r4, 48, 48 +; P8LE-NEXT: mulld r7, r12, r7 +; P8LE-NEXT: ori r11, r11, 16913 +; P8LE-NEXT: rlwinm r12, r4, 30, 18, 31 +; P8LE-NEXT: rldicl r3, r3, 32, 32 +; P8LE-NEXT: mulld r11, r12, r11 +; P8LE-NEXT: subf 
r6, r3, r6 +; P8LE-NEXT: rldicl r8, r8, 27, 37 +; P8LE-NEXT: srwi r6, r6, 1 +; P8LE-NEXT: add r3, r6, r3 +; P8LE-NEXT: rldicl r6, r7, 24, 40 +; P8LE-NEXT: mulli r7, r8, 98 +; P8LE-NEXT: srwi r3, r3, 6 +; P8LE-NEXT: rldicl r8, r11, 30, 34 +; P8LE-NEXT: mulli r6, r6, 1003 +; P8LE-NEXT: mulli r3, r3, 95 +; P8LE-NEXT: mulli r8, r8, 124 +; P8LE-NEXT: subf r7, r7, r9 +; P8LE-NEXT: subf r6, r6, r10 +; P8LE-NEXT: mtvsrd f0, r7 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: subf r4, r8, r4 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: fold_urem_vec_1: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 22765 +; P8BE-NEXT: lis r9, 16727 +; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: ori r9, r9, 2287 +; P8BE-NEXT: rldicl r5, r4, 16, 48 +; P8BE-NEXT: clrldi r6, r4, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rldicl r7, r4, 48, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: clrldi r8, r5, 32 +; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8BE-NEXT: mulld r3, r8, r3 +; P8BE-NEXT: lis r8, 21399 +; P8BE-NEXT: clrldi r10, r6, 32 +; P8BE-NEXT: ori r8, r8, 33437 +; P8BE-NEXT: clrldi r11, r7, 32 +; P8BE-NEXT: mulld r9, r10, r9 +; P8BE-NEXT: lis r10, 8456 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: mulld r8, r11, r8 +; P8BE-NEXT: ori r10, r10, 16913 +; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8BE-NEXT: mulld r10, r11, r10 +; P8BE-NEXT: subf r11, r3, r5 +; P8BE-NEXT: srwi r11, r11, 1 +; P8BE-NEXT: rldicl r9, r9, 24, 40 +; P8BE-NEXT: add r3, r11, r3 +; P8BE-NEXT: rldicl r8, r8, 27, 37 +; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: mulli r9, r9, 1003 +; P8BE-NEXT: rldicl r10, r10, 30, 34 +; P8BE-NEXT: mulli r8, r8, 98 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: mulli r10, r10, 124 +; P8BE-NEXT: subf r6, r9, r6 +; P8BE-NEXT: subf r7, r8, r7 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: subf r4, r10, r4 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r5, r7, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r5 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v4, v5 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; P9LE-LABEL: fold_urem_vec_2: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd 
v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: fold_urem_vec_2: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r6, r4, r3 +; P9BE-NEXT: srwi r6, r6, 1 +; P9BE-NEXT: add r4, r6, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r6, r4, r3 +; P9BE-NEXT: srwi r6, r6, 1 +; P9BE-NEXT: add r4, r6, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r6, r4, r3 +; P9BE-NEXT: srwi r6, r6, 1 +; P9BE-NEXT: add r4, r6, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r5, r4, r3 +; P9BE-NEXT: srwi r5, r5, 1 +; P9BE-NEXT: add r4, r5, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: fold_urem_vec_2: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r4, 22765 +; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r4, r4, 8969 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: clrldi r3, r5, 48 +; P8LE-NEXT: rldicl r6, r5, 48, 48 +; P8LE-NEXT: rlwinm r8, r3, 0, 16, 31 +; P8LE-NEXT: rldicl r7, r5, 32, 48 +; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 +; P8LE-NEXT: rldicl r5, r5, 16, 48 +; P8LE-NEXT: clrldi r11, r8, 32 +; P8LE-NEXT: rlwinm r10, r7, 0, 
16, 31 +; P8LE-NEXT: rlwinm r12, r5, 0, 16, 31 +; P8LE-NEXT: mulld r11, r11, r4 +; P8LE-NEXT: clrldi r0, r9, 32 +; P8LE-NEXT: clrldi r30, r10, 32 +; P8LE-NEXT: clrldi r29, r12, 32 +; P8LE-NEXT: mulld r0, r0, r4 +; P8LE-NEXT: mulld r30, r30, r4 +; P8LE-NEXT: mulld r4, r29, r4 +; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; P8LE-NEXT: rldicl r11, r11, 32, 32 +; P8LE-NEXT: subf r8, r11, r8 +; P8LE-NEXT: rldicl r0, r0, 32, 32 +; P8LE-NEXT: srwi r8, r8, 1 +; P8LE-NEXT: rldicl r30, r30, 32, 32 +; P8LE-NEXT: rldicl r4, r4, 32, 32 +; P8LE-NEXT: subf r9, r0, r9 +; P8LE-NEXT: add r8, r8, r11 +; P8LE-NEXT: subf r10, r30, r10 +; P8LE-NEXT: subf r11, r4, r12 +; P8LE-NEXT: srwi r9, r9, 1 +; P8LE-NEXT: srwi r8, r8, 6 +; P8LE-NEXT: srwi r10, r10, 1 +; P8LE-NEXT: srwi r11, r11, 1 +; P8LE-NEXT: add r9, r9, r0 +; P8LE-NEXT: add r10, r10, r30 +; P8LE-NEXT: add r4, r11, r4 +; P8LE-NEXT: srwi r9, r9, 6 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: mulli r8, r8, 95 +; P8LE-NEXT: srwi r10, r10, 6 +; P8LE-NEXT: srwi r4, r4, 6 +; P8LE-NEXT: mulli r9, r9, 95 +; P8LE-NEXT: mulli r10, r10, 95 +; P8LE-NEXT: mulli r4, r4, 95 +; P8LE-NEXT: subf r3, r8, r3 +; P8LE-NEXT: subf r6, r9, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r10, r7 +; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v5, v4 +; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: fold_urem_vec_2: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 22765 +; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: clrldi r8, r5, 32 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8BE-NEXT: clrldi r9, r6, 32 +; P8BE-NEXT: mulld r8, r8, r3 +; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8BE-NEXT: clrldi r10, r7, 32 +; P8BE-NEXT: mulld r9, r9, r3 +; P8BE-NEXT: clrldi r11, r4, 32 +; P8BE-NEXT: mulld r10, r10, r3 +; P8BE-NEXT: mulld r3, r11, r3 +; P8BE-NEXT: rldicl r8, r8, 32, 32 +; P8BE-NEXT: rldicl r9, r9, 32, 32 +; P8BE-NEXT: subf r11, r8, r5 +; P8BE-NEXT: rldicl r10, r10, 32, 32 +; P8BE-NEXT: subf r12, r9, r6 +; P8BE-NEXT: srwi r11, r11, 1 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: add r8, r11, r8 +; P8BE-NEXT: subf r11, r10, r7 +; P8BE-NEXT: srwi r12, r12, 1 +; P8BE-NEXT: add r9, r12, r9 +; P8BE-NEXT: subf r12, r3, r4 +; P8BE-NEXT: srwi r11, r11, 1 +; P8BE-NEXT: srwi r8, r8, 6 +; P8BE-NEXT: add r10, r11, r10 +; P8BE-NEXT: srwi r11, r12, 1 +; P8BE-NEXT: srwi r9, r9, 6 +; P8BE-NEXT: add r3, r11, r3 +; P8BE-NEXT: srwi r10, r10, 6 +; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: mulli r8, r8, 95 +; P8BE-NEXT: mulli r9, r9, 95 +; P8BE-NEXT: mulli r10, r10, 95 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: subf r5, r8, r5 +; P8BE-NEXT: subf r6, r9, r6 +; P8BE-NEXT: subf r7, r10, r7 +; P8BE-NEXT: subf r3, r3, r4 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: sldi r4, r7, 48 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v3, v2 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, 
+ ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. +define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; P9LE-LABEL: combine_urem_udiv: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r5, r4, 95 +; P9LE-NEXT: subf r3, r5, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r5, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r7, r5, 32 +; P9LE-NEXT: mulld r7, r7, r6 +; P9LE-NEXT: rldicl r7, r7, 32, 32 +; P9LE-NEXT: subf r5, r7, r5 +; P9LE-NEXT: srwi r5, r5, 1 +; P9LE-NEXT: add r5, r5, r7 +; P9LE-NEXT: srwi r5, r5, 6 +; P9LE-NEXT: mulli r7, r5, 95 +; P9LE-NEXT: subf r3, r7, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r7, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r8, r7, 32 +; P9LE-NEXT: mulld r8, r8, r6 +; P9LE-NEXT: rldicl r8, r8, 32, 32 +; P9LE-NEXT: subf r7, r8, r7 +; P9LE-NEXT: srwi r7, r7, 1 +; P9LE-NEXT: add r7, r7, r8 +; P9LE-NEXT: srwi r7, r7, 6 +; P9LE-NEXT: mulli r8, r7, 95 +; P9LE-NEXT: subf r3, r8, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r8, r3, 0, 16, 31 +; P9LE-NEXT: clrldi r9, r8, 32 +; P9LE-NEXT: mulld r6, r9, r6 +; P9LE-NEXT: rldicl r6, r6, 32, 32 +; P9LE-NEXT: subf r8, r6, r8 +; P9LE-NEXT: srwi r8, r8, 1 +; P9LE-NEXT: add r6, r8, r6 +; P9LE-NEXT: srwi r6, r6, 6 +; P9LE-NEXT: mulli r8, r6, 95 +; P9LE-NEXT: subf r3, r8, r3 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: mtvsrd f0, r4 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r5 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r7 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r6 +; P9LE-NEXT: xxswapd v5, vs0 +; P9LE-NEXT: vmrglh v4, v5, v4 +; P9LE-NEXT: vmrglw v3, v4, v3 +; P9LE-NEXT: vadduhm v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: combine_urem_udiv: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9BE-NEXT: lis r6, 22765 +; P9BE-NEXT: ori r6, r6, 8969 +; P9BE-NEXT: clrldi r5, r4, 32 +; P9BE-NEXT: mulld r5, r5, r6 +; P9BE-NEXT: rldicl r5, r5, 32, 32 +; P9BE-NEXT: subf r4, r5, r4 +; P9BE-NEXT: srwi r4, r4, 1 +; P9BE-NEXT: add r4, r4, r5 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r5, r4, 95 +; P9BE-NEXT: subf r3, r5, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r5, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r7, r5, 32 +; P9BE-NEXT: mulld r7, r7, r6 +; P9BE-NEXT: rldicl r7, r7, 32, 32 +; P9BE-NEXT: subf r5, r7, r5 +; P9BE-NEXT: srwi r5, r5, 1 +; P9BE-NEXT: add r5, r5, r7 +; P9BE-NEXT: srwi r5, r5, 6 +; P9BE-NEXT: mulli r7, r5, 95 +; P9BE-NEXT: subf r3, r7, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r7, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r8, r7, 32 +; P9BE-NEXT: 
mulld r8, r8, r6 +; P9BE-NEXT: rldicl r8, r8, 32, 32 +; P9BE-NEXT: subf r7, r8, r7 +; P9BE-NEXT: srwi r7, r7, 1 +; P9BE-NEXT: add r7, r7, r8 +; P9BE-NEXT: srwi r7, r7, 6 +; P9BE-NEXT: mulli r8, r7, 95 +; P9BE-NEXT: subf r3, r8, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r8, r3, 32 +; P9BE-NEXT: mulld r6, r8, r6 +; P9BE-NEXT: rldicl r6, r6, 32, 32 +; P9BE-NEXT: subf r8, r6, r3 +; P9BE-NEXT: srwi r8, r8, 1 +; P9BE-NEXT: add r6, r8, r6 +; P9BE-NEXT: srwi r6, r6, 6 +; P9BE-NEXT: mulli r8, r6, 95 +; P9BE-NEXT: subf r3, r8, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: sldi r3, r4, 48 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: sldi r3, r5, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: sldi r3, r7, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: sldi r3, r6, 48 +; P9BE-NEXT: mtvsrd v5, r3 +; P9BE-NEXT: vmrghh v4, v5, v4 +; P9BE-NEXT: vmrghw v3, v4, v3 +; P9BE-NEXT: vadduhm v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: combine_urem_udiv: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r5, 22765 +; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; P8LE-NEXT: ori r5, r5, 8969 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: clrldi r3, r6, 48 +; P8LE-NEXT: rldicl r4, r6, 48, 48 +; P8LE-NEXT: rldicl r7, r6, 32, 48 +; P8LE-NEXT: rlwinm r8, r3, 0, 16, 31 +; P8LE-NEXT: rlwinm r9, r4, 0, 16, 31 +; P8LE-NEXT: rldicl r6, r6, 16, 48 +; P8LE-NEXT: rlwinm r10, r7, 0, 16, 31 +; P8LE-NEXT: clrldi r11, r8, 32 +; P8LE-NEXT: rlwinm r12, r6, 0, 16, 31 +; P8LE-NEXT: clrldi r0, r9, 32 +; P8LE-NEXT: clrldi r30, r10, 32 +; P8LE-NEXT: mulld r11, r11, r5 +; P8LE-NEXT: clrldi r29, r12, 32 +; P8LE-NEXT: mulld r0, r0, r5 +; P8LE-NEXT: mulld r30, r30, r5 +; P8LE-NEXT: mulld r5, r29, r5 +; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; P8LE-NEXT: rldicl r11, r11, 32, 32 +; P8LE-NEXT: rldicl r0, r0, 32, 32 +; P8LE-NEXT: rldicl r30, r30, 32, 32 +; P8LE-NEXT: subf r8, r11, r8 +; P8LE-NEXT: rldicl r5, r5, 32, 32 +; P8LE-NEXT: subf r9, r0, r9 +; P8LE-NEXT: srwi r8, r8, 1 +; P8LE-NEXT: subf r10, r30, r10 +; P8LE-NEXT: add r8, r8, r11 +; P8LE-NEXT: srwi r9, r9, 1 +; P8LE-NEXT: srwi r10, r10, 1 +; P8LE-NEXT: subf r11, r5, r12 +; P8LE-NEXT: add r9, r9, r0 +; P8LE-NEXT: srwi r8, r8, 6 +; P8LE-NEXT: add r10, r10, r30 +; P8LE-NEXT: srwi r11, r11, 1 +; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8LE-NEXT: srwi r9, r9, 6 +; P8LE-NEXT: mulli r12, r8, 95 +; P8LE-NEXT: srwi r10, r10, 6 +; P8LE-NEXT: add r5, r11, r5 +; P8LE-NEXT: mtvsrd f0, r8 +; P8LE-NEXT: mulli r8, r9, 95 +; P8LE-NEXT: mtvsrd f1, r9 +; P8LE-NEXT: mulli r9, r10, 95 +; P8LE-NEXT: srwi r5, r5, 6 +; P8LE-NEXT: mtvsrd f3, r5 +; P8LE-NEXT: mulli r5, r5, 95 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: mtvsrd f2, r10 +; P8LE-NEXT: subf r3, r12, r3 +; P8LE-NEXT: xxswapd v6, vs3 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: subf r3, r9, r7 +; P8LE-NEXT: subf r4, r8, r4 +; P8LE-NEXT: xxswapd v1, vs2 +; P8LE-NEXT: mtvsrd f4, r3 +; P8LE-NEXT: subf r3, r5, r6 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f5, r3 +; P8LE-NEXT: xxswapd v5, vs4 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: xxswapd v3, vs0 +; P8LE-NEXT: xxswapd v4, vs1 +; P8LE-NEXT: xxswapd v0, vs5 +; P8LE-NEXT: vmrglh v3, v4, v3 +; 
P8LE-NEXT: vmrglh v4, v0, v5 +; P8LE-NEXT: vmrglh v5, v6, v1 +; P8LE-NEXT: vmrglw v3, v4, v3 +; P8LE-NEXT: vmrglw v2, v5, v2 +; P8LE-NEXT: vadduhm v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: combine_urem_udiv: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r5, 22765 +; P8BE-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; P8BE-NEXT: ori r5, r5, 8969 +; P8BE-NEXT: clrldi r3, r6, 48 +; P8BE-NEXT: rldicl r4, r6, 48, 48 +; P8BE-NEXT: rlwinm r8, r3, 0, 16, 31 +; P8BE-NEXT: rldicl r7, r6, 32, 48 +; P8BE-NEXT: rlwinm r9, r4, 0, 16, 31 +; P8BE-NEXT: rldicl r6, r6, 16, 48 +; P8BE-NEXT: clrldi r11, r8, 32 +; P8BE-NEXT: rlwinm r10, r7, 0, 16, 31 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: clrldi r12, r9, 32 +; P8BE-NEXT: mulld r11, r11, r5 +; P8BE-NEXT: clrldi r0, r10, 32 +; P8BE-NEXT: clrldi r30, r6, 32 +; P8BE-NEXT: mulld r12, r12, r5 +; P8BE-NEXT: mulld r0, r0, r5 +; P8BE-NEXT: mulld r5, r30, r5 +; P8BE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8BE-NEXT: rldicl r11, r11, 32, 32 +; P8BE-NEXT: rldicl r12, r12, 32, 32 +; P8BE-NEXT: subf r8, r11, r8 +; P8BE-NEXT: rldicl r5, r5, 32, 32 +; P8BE-NEXT: subf r9, r12, r9 +; P8BE-NEXT: srwi r8, r8, 1 +; P8BE-NEXT: rldicl r0, r0, 32, 32 +; P8BE-NEXT: add r8, r8, r11 +; P8BE-NEXT: srwi r9, r9, 1 +; P8BE-NEXT: subf r11, r5, r6 +; P8BE-NEXT: subf r10, r0, r10 +; P8BE-NEXT: add r9, r9, r12 +; P8BE-NEXT: srwi r8, r8, 6 +; P8BE-NEXT: srwi r11, r11, 1 +; P8BE-NEXT: srwi r10, r10, 1 +; P8BE-NEXT: srwi r9, r9, 6 +; P8BE-NEXT: add r5, r11, r5 +; P8BE-NEXT: mulli r12, r8, 95 +; P8BE-NEXT: add r10, r10, r0 +; P8BE-NEXT: srwi r5, r5, 6 +; P8BE-NEXT: mulli r11, r9, 95 +; P8BE-NEXT: sldi r9, r9, 48 +; P8BE-NEXT: srwi r10, r10, 6 +; P8BE-NEXT: sldi r8, r8, 48 +; P8BE-NEXT: mtvsrd v3, r9 +; P8BE-NEXT: mulli r9, r5, 95 +; P8BE-NEXT: mtvsrd v2, r8 +; P8BE-NEXT: mulli r8, r10, 95 +; P8BE-NEXT: sldi r10, r10, 48 +; P8BE-NEXT: subf r3, r12, r3 +; P8BE-NEXT: mtvsrd v4, r10 +; P8BE-NEXT: subf r4, r11, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: subf r3, r9, r6 +; P8BE-NEXT: subf r7, r8, r7 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: mtvsrd v1, r3 +; P8BE-NEXT: sldi r3, r5, 48 +; P8BE-NEXT: mtvsrd v0, r6 +; P8BE-NEXT: vmrghh v3, v5, v3 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v0, v1, v0 +; P8BE-NEXT: vmrghh v4, v5, v4 +; P8BE-NEXT: vmrghw v3, v0, v3 +; P8BE-NEXT: vmrghw v2, v4, v2 +; P8BE-NEXT: vadduhm v2, v3, v2 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_urem_power_of_two: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r3, r3, 0, 26, 31 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r3, r3, 0, 27, 31 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: lis r6, 22765 +; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: clrldi r5, r4, 32 +; P9LE-NEXT: mulld r5, r5, r6 +; P9LE-NEXT: rldicl r5, r5, 32, 32 +; P9LE-NEXT: subf r4, r5, r4 +; P9LE-NEXT: srwi r4, r4, 1 +; P9LE-NEXT: add r4, r4, r5 +; P9LE-NEXT: srwi r4, r4, 6 +; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglw v2, v2, v3 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_urem_power_of_two: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 27, 31 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 26, 31 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 32, 32 +; P9BE-NEXT: subf r5, r4, r3 +; P9BE-NEXT: srwi r5, r5, 1 +; P9BE-NEXT: add r4, r5, r4 +; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulli r4, r4, 95 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 +; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghw v2, v3, v2 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_urem_power_of_two: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: lis r3, 22765 +; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r5, r4, 16, 48 +; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 +; P8LE-NEXT: clrldi r7, r6, 32 +; P8LE-NEXT: mulld r3, r7, r3 +; P8LE-NEXT: rldicl r7, r4, 48, 48 +; P8LE-NEXT: rlwinm r7, r7, 0, 27, 31 +; P8LE-NEXT: mtvsrd f1, r7 +; P8LE-NEXT: rldicl r3, r3, 32, 32 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: subf r6, r3, r6 +; P8LE-NEXT: srwi r6, r6, 1 +; P8LE-NEXT: add r3, r6, r3 +; P8LE-NEXT: clrldi r6, r4, 48 +; P8LE-NEXT: srwi r3, r3, 6 +; P8LE-NEXT: rldicl r4, r4, 32, 48 +; P8LE-NEXT: rlwinm r6, r6, 0, 26, 31 +; P8LE-NEXT: mulli r3, r3, 95 +; P8LE-NEXT: rlwinm r4, r4, 0, 29, 31 +; P8LE-NEXT: mtvsrd f0, r6 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: xxswapd v5, vs3 +; P8LE-NEXT: subf r3, r3, r5 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v3, v2 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_urem_power_of_two: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 22765 +; 
P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r7, r4, 16, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rlwinm r7, r7, 0, 26, 31 +; P8BE-NEXT: clrldi r6, r5, 32 +; P8BE-NEXT: mulld r3, r6, r3 +; P8BE-NEXT: rldicl r3, r3, 32, 32 +; P8BE-NEXT: subf r6, r3, r5 +; P8BE-NEXT: srwi r6, r6, 1 +; P8BE-NEXT: add r3, r6, r3 +; P8BE-NEXT: rldicl r6, r4, 32, 48 +; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: rldicl r4, r4, 48, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 27, 31 +; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: rlwinm r4, r4, 0, 29, 31 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: sldi r4, r4, 48 +; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: subf r3, r3, r5 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v4, r3 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { +; P9LE-LABEL: dont_fold_urem_one: +; P9LE: # %bb.0: +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: li r5, 0 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: oris r6, r5, 45590 +; P9LE-NEXT: oris r5, r5, 51306 +; P9LE-NEXT: ori r6, r6, 17097 +; P9LE-NEXT: ori r5, r5, 30865 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: mulld r4, r4, r6 +; P9LE-NEXT: lis r6, 24749 +; P9LE-NEXT: ori r6, r6, 47143 +; P9LE-NEXT: rldicl r4, r4, 28, 36 +; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 +; P9LE-NEXT: mulld r4, r4, r6 +; P9LE-NEXT: rldicl r4, r4, 21, 43 +; P9LE-NEXT: mulli r4, r4, 5423 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: vextuhrx r3, r3, v2 +; P9LE-NEXT: rlwinm r4, r3, 31, 17, 31 +; P9LE-NEXT: mulld r4, r4, r5 +; P9LE-NEXT: rldicl r4, r4, 24, 40 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: xxswapd v4, vs0 +; P9LE-NEXT: mtvsrd f0, r3 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: vmrglh v3, v4, v3 +; P9LE-NEXT: xxlxor v4, v4, v4 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_urem_one: +; P9BE: # %bb.0: +; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: lis r5, 24749 +; P9BE-NEXT: ori r5, r5, 47143 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r5 +; P9BE-NEXT: li r5, 0 +; P9BE-NEXT: oris r6, r5, 45590 +; P9BE-NEXT: oris r5, r5, 51306 +; P9BE-NEXT: ori r6, r6, 17097 +; P9BE-NEXT: ori r5, r5, 30865 +; P9BE-NEXT: rldicl r4, r4, 21, 43 +; P9BE-NEXT: mulli r4, r4, 5423 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v3, r3 +; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: clrldi r4, r3, 32 +; P9BE-NEXT: mulld r4, r4, r6 +; P9BE-NEXT: rldicl r4, r4, 28, 36 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: clrlwi r4, r3, 16 +; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 +; P9BE-NEXT: mulld r3, r3, r5 +; P9BE-NEXT: rldicl r3, r3, 24, 40 +; P9BE-NEXT: mulli r3, r3, 654 +; P9BE-NEXT: subf r3, r3, r4 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: 
mtvsrd v2, r3 +; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: vmrghh v2, v4, v2 +; P9BE-NEXT: vmrghw v2, v2, v3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_urem_one: +; P8LE: # %bb.0: +; P8LE-NEXT: xxswapd vs0, v2 +; P8LE-NEXT: li r3, 0 +; P8LE-NEXT: lis r8, 24749 +; P8LE-NEXT: xxlxor v5, v5, v5 +; P8LE-NEXT: oris r5, r3, 45590 +; P8LE-NEXT: ori r8, r8, 47143 +; P8LE-NEXT: oris r3, r3, 51306 +; P8LE-NEXT: ori r5, r5, 17097 +; P8LE-NEXT: ori r3, r3, 30865 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: rldicl r6, r4, 32, 48 +; P8LE-NEXT: rldicl r7, r4, 16, 48 +; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 +; P8LE-NEXT: rldicl r4, r4, 48, 48 +; P8LE-NEXT: mulld r5, r9, r5 +; P8LE-NEXT: rlwinm r9, r7, 0, 16, 31 +; P8LE-NEXT: mulld r8, r9, r8 +; P8LE-NEXT: rlwinm r9, r4, 31, 17, 31 +; P8LE-NEXT: mulld r3, r9, r3 +; P8LE-NEXT: rldicl r5, r5, 28, 36 +; P8LE-NEXT: rldicl r8, r8, 21, 43 +; P8LE-NEXT: mulli r5, r5, 23 +; P8LE-NEXT: rldicl r3, r3, 24, 40 +; P8LE-NEXT: mulli r8, r8, 5423 +; P8LE-NEXT: mulli r3, r3, 654 +; P8LE-NEXT: subf r5, r5, r6 +; P8LE-NEXT: subf r6, r8, r7 +; P8LE-NEXT: mtvsrd f0, r5 +; P8LE-NEXT: subf r3, r3, r4 +; P8LE-NEXT: mtvsrd f1, r6 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 +; P8LE-NEXT: vmrglh v2, v3, v2 +; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_urem_one: +; P8BE: # %bb.0: +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: li r3, 0 +; P8BE-NEXT: lis r8, 24749 +; P8BE-NEXT: oris r6, r3, 51306 +; P8BE-NEXT: ori r8, r8, 47143 +; P8BE-NEXT: oris r3, r3, 45590 +; P8BE-NEXT: rldicl r5, r4, 32, 48 +; P8BE-NEXT: clrldi r7, r4, 48 +; P8BE-NEXT: ori r6, r6, 30865 +; P8BE-NEXT: ori r3, r3, 17097 +; P8BE-NEXT: rldicl r4, r4, 48, 48 +; P8BE-NEXT: rlwinm r9, r5, 31, 17, 31 +; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8BE-NEXT: mulld r6, r9, r6 +; P8BE-NEXT: clrldi r9, r7, 32 +; P8BE-NEXT: mulld r8, r9, r8 +; P8BE-NEXT: clrldi r9, r4, 32 +; P8BE-NEXT: mulld r3, r9, r3 +; P8BE-NEXT: li r9, 0 +; P8BE-NEXT: rldicl r6, r6, 24, 40 +; P8BE-NEXT: mulli r6, r6, 654 +; P8BE-NEXT: rldicl r8, r8, 21, 43 +; P8BE-NEXT: rldicl r3, r3, 28, 36 +; P8BE-NEXT: mulli r8, r8, 5423 +; P8BE-NEXT: mulli r3, r3, 23 +; P8BE-NEXT: subf r5, r6, r5 +; P8BE-NEXT: sldi r6, r9, 48 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: subf r6, r8, r7 +; P8BE-NEXT: mtvsrd v3, r5 +; P8BE-NEXT: subf r3, r3, r4 +; P8BE-NEXT: sldi r4, r6, 48 +; P8BE-NEXT: sldi r3, r3, 48 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v2, v3 +; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghw v2, v2, v3 +; P8BE-NEXT: blr + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: blr + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. 
+define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; P9LE-LABEL: dont_fold_urem_i64: +; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 25644 +; P9LE-NEXT: ori r4, r4, 34192 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 45590 +; P9LE-NEXT: mfvsrld r3, v3 +; P9LE-NEXT: ori r4, r4, 17097 +; P9LE-NEXT: mulhdu r4, r3, r4 +; P9LE-NEXT: sub r5, r3, r4 +; P9LE-NEXT: rldicl r5, r5, 63, 1 +; P9LE-NEXT: add r4, r5, r4 +; P9LE-NEXT: lis r5, -16037 +; P9LE-NEXT: rldicl r4, r4, 60, 4 +; P9LE-NEXT: ori r5, r5, 28749 +; P9LE-NEXT: mulli r4, r4, 23 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: oris r5, r5, 52170 +; P9LE-NEXT: ori r5, r5, 12109 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: mfvsrd r4, v3 +; P9LE-NEXT: mulhdu r5, r4, r5 +; P9LE-NEXT: rldicl r5, r5, 52, 12 +; P9LE-NEXT: mulli r5, r5, 5423 +; P9LE-NEXT: sub r4, r4, r5 +; P9LE-NEXT: lis r5, 25653 +; P9LE-NEXT: ori r5, r5, 15432 +; P9LE-NEXT: sldi r5, r5, 32 +; P9LE-NEXT: mtvsrdd v3, r4, r3 +; P9LE-NEXT: mfvsrd r3, v2 +; P9LE-NEXT: rldicl r4, r3, 63, 1 +; P9LE-NEXT: oris r5, r5, 1603 +; P9LE-NEXT: ori r5, r5, 21445 +; P9LE-NEXT: mulhdu r4, r4, r5 +; P9LE-NEXT: rldicl r4, r4, 57, 7 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: li r4, 0 +; P9LE-NEXT: mtvsrdd v2, r3, r4 +; P9LE-NEXT: blr +; +; P9BE-LABEL: dont_fold_urem_i64: +; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 25644 +; P9BE-NEXT: ori r4, r4, 34192 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 45590 +; P9BE-NEXT: mfvsrd r3, v3 +; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: mulhdu r4, r3, r4 +; P9BE-NEXT: sub r5, r3, r4 +; P9BE-NEXT: rldicl r5, r5, 63, 1 +; P9BE-NEXT: add r4, r5, r4 +; P9BE-NEXT: lis r5, -16037 +; P9BE-NEXT: rldicl r4, r4, 60, 4 +; P9BE-NEXT: mulli r4, r4, 23 +; P9BE-NEXT: ori r5, r5, 28749 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: oris r5, r5, 52170 +; P9BE-NEXT: ori r5, r5, 12109 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: mfvsrld r4, v3 +; P9BE-NEXT: mulhdu r5, r4, r5 +; P9BE-NEXT: rldicl r5, r5, 52, 12 +; P9BE-NEXT: mulli r5, r5, 5423 +; P9BE-NEXT: sub r4, r4, r5 +; P9BE-NEXT: lis r5, 25653 +; P9BE-NEXT: ori r5, r5, 15432 +; P9BE-NEXT: sldi r5, r5, 32 +; P9BE-NEXT: mtvsrdd v3, r3, r4 +; P9BE-NEXT: mfvsrld r3, v2 +; P9BE-NEXT: rldicl r4, r3, 63, 1 +; P9BE-NEXT: oris r5, r5, 1603 +; P9BE-NEXT: ori r5, r5, 21445 +; P9BE-NEXT: mulhdu r4, r4, r5 +; P9BE-NEXT: rldicl r4, r4, 57, 7 +; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: mtvsrdd v2, 0, r3 +; P9BE-NEXT: blr +; +; P8LE-LABEL: dont_fold_urem_i64: +; P8LE: # %bb.0: +; P8LE-NEXT: lis r3, 25644 +; P8LE-NEXT: xxswapd vs0, v3 +; P8LE-NEXT: lis r4, -16037 +; P8LE-NEXT: lis r5, 25653 +; P8LE-NEXT: mfvsrd r6, v2 +; P8LE-NEXT: ori r3, r3, 34192 +; P8LE-NEXT: ori r4, r4, 28749 +; P8LE-NEXT: ori r5, r5, 15432 +; P8LE-NEXT: mfvsrd r8, v3 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: oris r3, r3, 45590 +; P8LE-NEXT: mfvsrd r7, f0 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: oris r4, r4, 52170 +; P8LE-NEXT: ori r3, r3, 17097 +; P8LE-NEXT: oris r5, r5, 1603 +; P8LE-NEXT: ori r4, r4, 12109 +; P8LE-NEXT: mulhdu r3, r7, r3 +; P8LE-NEXT: rldicl r9, r6, 63, 1 +; P8LE-NEXT: ori r5, r5, 21445 +; P8LE-NEXT: mulhdu r4, r8, r4 +; P8LE-NEXT: mulhdu r5, r9, r5 +; P8LE-NEXT: sub r9, r7, r3 +; P8LE-NEXT: rldicl r9, r9, 63, 1 +; P8LE-NEXT: rldicl r4, r4, 52, 12 +; P8LE-NEXT: add r3, r9, r3 +; P8LE-NEXT: rldicl r5, r5, 57, 7 +; P8LE-NEXT: mulli r4, r4, 5423 +; P8LE-NEXT: rldicl r3, r3, 60, 4 +; P8LE-NEXT: mulli r5, r5, 654 +; P8LE-NEXT: mulli r3, r3, 23 +; P8LE-NEXT: 
sub r4, r8, r4 +; P8LE-NEXT: sub r5, r6, r5 +; P8LE-NEXT: mtvsrd f0, r4 +; P8LE-NEXT: sub r3, r7, r3 +; P8LE-NEXT: li r4, 0 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: xxmrghd v3, vs0, vs2 +; P8LE-NEXT: xxmrghd v2, vs1, vs3 +; P8LE-NEXT: blr +; +; P8BE-LABEL: dont_fold_urem_i64: +; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 25644 +; P8BE-NEXT: lis r4, -16037 +; P8BE-NEXT: xxswapd vs0, v3 +; P8BE-NEXT: xxswapd vs1, v2 +; P8BE-NEXT: lis r5, 25653 +; P8BE-NEXT: ori r3, r3, 34192 +; P8BE-NEXT: ori r4, r4, 28749 +; P8BE-NEXT: mfvsrd r6, v3 +; P8BE-NEXT: ori r5, r5, 15432 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: sldi r4, r4, 32 +; P8BE-NEXT: oris r3, r3, 45590 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: mfvsrd r7, f0 +; P8BE-NEXT: oris r4, r4, 52170 +; P8BE-NEXT: ori r3, r3, 17097 +; P8BE-NEXT: mfvsrd r8, f1 +; P8BE-NEXT: oris r5, r5, 1603 +; P8BE-NEXT: ori r4, r4, 12109 +; P8BE-NEXT: mulhdu r3, r6, r3 +; P8BE-NEXT: ori r5, r5, 21445 +; P8BE-NEXT: mulhdu r4, r7, r4 +; P8BE-NEXT: rldicl r9, r8, 63, 1 +; P8BE-NEXT: mulhdu r5, r9, r5 +; P8BE-NEXT: sub r9, r6, r3 +; P8BE-NEXT: rldicl r9, r9, 63, 1 +; P8BE-NEXT: rldicl r4, r4, 52, 12 +; P8BE-NEXT: add r3, r9, r3 +; P8BE-NEXT: mulli r4, r4, 5423 +; P8BE-NEXT: rldicl r5, r5, 57, 7 +; P8BE-NEXT: rldicl r3, r3, 60, 4 +; P8BE-NEXT: mulli r5, r5, 654 +; P8BE-NEXT: mulli r3, r3, 23 +; P8BE-NEXT: sub r4, r7, r4 +; P8BE-NEXT: mtvsrd f0, r4 +; P8BE-NEXT: sub r4, r8, r5 +; P8BE-NEXT: sub r3, r6, r3 +; P8BE-NEXT: mtvsrd f1, r4 +; P8BE-NEXT: li r4, 0 +; P8BE-NEXT: mtvsrd f2, r3 +; P8BE-NEXT: mtvsrd f3, r4 +; P8BE-NEXT: xxmrghd v3, vs2, vs0 +; P8BE-NEXT: xxmrghd v2, vs3, vs1 +; P8BE-NEXT: blr + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} diff --git a/llvm/test/CodeGen/PowerPC/use-cr-result-of-dom-icmp-st.ll b/llvm/test/CodeGen/PowerPC/use-cr-result-of-dom-icmp-st.ll index 7839669bea9b13..3a41db0cb9809f 100644 --- a/llvm/test/CodeGen/PowerPC/use-cr-result-of-dom-icmp-st.ll +++ b/llvm/test/CodeGen/PowerPC/use-cr-result-of-dom-icmp-st.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O3 < %s | FileCheck %s -check-prefix=PPC64LE +; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -O3 -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s ; Test cases are generated from: ; long long NAME(PARAM a, PARAM b) { @@ -13,22 +13,21 @@ ; for example. ll is PARAM, a_op_b (i.e., a << b) is LHS, _1 (i.e., -1) is RHS. 
target datalayout = "e-m:e-i64:64-n32:64" -target triple = "powerpc64le-unknown-linux-gnu" define i64 @ll_a_op_b__2(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a_op_b__2: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: sld 5, 3, 4 -; PPC64LE-NEXT: cmpdi 5, -2 -; PPC64LE-NEXT: ble 0, .LBB0_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB0_2: # %if.end -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a_op_b__2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sld r5, r3, r4 +; CHECK-NEXT: cmpdi r5, -2 +; CHECK-NEXT: ble cr0, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB0_2: # %if.end +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, -2 @@ -45,19 +44,19 @@ return: ; preds = %entry } define i64 @ll_a_op_b__1(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a_op_b__1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: sld 5, 3, 4 -; PPC64LE-NEXT: cmpdi 5, -1 -; PPC64LE-NEXT: ble 0, .LBB1_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB1_2: # %if.end -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a_op_b__1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sld r5, r3, r4 +; CHECK-NEXT: cmpdi r5, -1 +; CHECK-NEXT: ble cr0, .LBB1_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB1_2: # %if.end +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, -1 @@ -74,19 +73,19 @@ return: ; preds = %entry } define i64 @ll_a_op_b_0(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a_op_b_0: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: sld. 5, 3, 4 -; PPC64LE-NEXT: ble 0, .LBB2_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB2_2: # %if.end -; PPC64LE-NEXT: cmpldi 5, 0 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a_op_b_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sld. 
r5, r3, r4 +; CHECK-NEXT: ble cr0, .LBB2_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB2_2: # %if.end +; CHECK-NEXT: cmpldi r5, 0 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, 0 @@ -103,20 +102,20 @@ return: ; preds = %entry } define i64 @ll_a_op_b_1(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a_op_b_1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: sld 5, 3, 4 -; PPC64LE-NEXT: cmpdi 5, 1 -; PPC64LE-NEXT: ble 0, .LBB3_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB3_2: # %if.end -; PPC64LE-NEXT: cmpldi 5, 1 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a_op_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sld r5, r3, r4 +; CHECK-NEXT: cmpdi r5, 1 +; CHECK-NEXT: ble cr0, .LBB3_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB3_2: # %if.end +; CHECK-NEXT: cmpldi r5, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, 1 @@ -133,20 +132,20 @@ return: ; preds = %entry } define i64 @ll_a_op_b_2(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a_op_b_2: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: sld 5, 3, 4 -; PPC64LE-NEXT: cmpdi 5, 2 -; PPC64LE-NEXT: ble 0, .LBB4_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB4_2: # %if.end -; PPC64LE-NEXT: cmpldi 5, 2 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a_op_b_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sld r5, r3, r4 +; CHECK-NEXT: cmpdi r5, 2 +; CHECK-NEXT: ble cr0, .LBB4_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB4_2: # %if.end +; CHECK-NEXT: cmpldi r5, 2 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %shl = shl i64 %a, %b %cmp = icmp sgt i64 %shl, 2 @@ -163,18 +162,18 @@ return: ; preds = %entry } define i64 @ll_a__2(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a__2: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: cmpdi 3, -2 -; PPC64LE-NEXT: ble 0, .LBB5_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB5_2: # %if.end -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a__2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpdi r3, -2 +; CHECK-NEXT: ble cr0, .LBB5_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB5_2: # %if.end +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, -2 br i1 %cmp, label %return, label %if.end @@ -190,18 +189,18 @@ return: ; preds = %entry } define i64 @ll_a__1(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a__1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: cmpdi 3, -1 -; PPC64LE-NEXT: ble 0, .LBB6_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB6_2: # %if.end -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a__1: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpdi r3, -1 +; CHECK-NEXT: ble cr0, .LBB6_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB6_2: # %if.end +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, -1 br i1 %cmp, label %return, label %if.end @@ -217,19 +216,19 @@ return: ; preds = %entry } define i64 @ll_a_0(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a_0: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: cmpdi 3, 0 -; PPC64LE-NEXT: ble 0, .LBB7_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB7_2: # %if.end -; PPC64LE-NEXT: cmpldi 3, 0 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpdi r3, 0 +; CHECK-NEXT: ble cr0, .LBB7_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB7_2: # %if.end +; CHECK-NEXT: cmpldi r3, 0 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, 0 br i1 %cmp, label %return, label %if.end @@ -245,19 +244,19 @@ return: ; preds = %entry } define i64 @ll_a_1(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a_1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: cmpdi 3, 1 -; PPC64LE-NEXT: ble 0, .LBB8_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB8_2: # %if.end -; PPC64LE-NEXT: cmpldi 3, 1 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpdi r3, 1 +; CHECK-NEXT: ble cr0, .LBB8_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB8_2: # %if.end +; CHECK-NEXT: cmpldi r3, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, 1 br i1 %cmp, label %return, label %if.end @@ -273,19 +272,19 @@ return: ; preds = %entry } define i64 @ll_a_2(i64 %a, i64 %b) { -; PPC64LE-LABEL: ll_a_2: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: cmpdi 3, 2 -; PPC64LE-NEXT: ble 0, .LBB9_2 -; PPC64LE-NEXT: # %bb.1: # %return -; PPC64LE-NEXT: mr 3, 4 -; PPC64LE-NEXT: blr -; PPC64LE-NEXT: .LBB9_2: # %if.end -; PPC64LE-NEXT: cmpldi 3, 2 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: isel 4, 4, 5, 2 -; PPC64LE-NEXT: mulld 3, 4, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: ll_a_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpdi r3, 2 +; CHECK-NEXT: ble cr0, .LBB9_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: mr r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB9_2: # %if.end +; CHECK-NEXT: cmpldi r3, 2 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mulld r3, r4, r3 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i64 %a, 2 br i1 %cmp, label %return, label %if.end @@ -301,16 +300,18 @@ return: ; preds = %entry } define i64 @i_a_op_b__2(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a_op_b__2: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: slw 6, 3, 4 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmpwi 6, -2 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: mullw 3, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 3, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a_op_b__2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slw r5, r3, r4 +; 
CHECK-NEXT: cmpwi r5, -2 +; CHECK-NEXT: bgt cr0, .LBB10_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: .LBB10_2: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, -2 @@ -329,16 +330,20 @@ return: ; preds = %if.end, %entry } define i64 @i_a_op_b__1(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a_op_b__1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: slw 6, 3, 4 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmpwi 6, -1 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: mullw 3, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 3, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a_op_b__1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slw r5, r3, r4 +; CHECK-NEXT: cmpwi r5, -1 +; CHECK-NEXT: ble cr0, .LBB11_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB11_2: # %if.end +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, -1 @@ -357,16 +362,21 @@ return: ; preds = %if.end, %entry } define i64 @i_a_op_b_0(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a_op_b_0: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: slw. 5, 3, 4 -; PPC64LE-NEXT: li 6, 1 -; PPC64LE-NEXT: isel 6, 4, 6, 2 -; PPC64LE-NEXT: cmpwi 5, 0 -; PPC64LE-NEXT: mullw 3, 6, 3 -; PPC64LE-NEXT: isel 3, 4, 3, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a_op_b_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slw r5, r3, r4 +; CHECK-NEXT: cmpwi r5, 0 +; CHECK-NEXT: ble cr0, .LBB12_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB12_2: # %if.end +; CHECK-NEXT: cmplwi r5, 0 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, 0 @@ -385,17 +395,19 @@ return: ; preds = %if.end, %entry } define i64 @i_a_op_b_1(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a_op_b_1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: slw 6, 3, 4 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmplwi 6, 1 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: cmpwi 6, 1 -; PPC64LE-NEXT: mullw 3, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 3, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a_op_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slw r5, r3, r4 +; CHECK-NEXT: cmpwi r5, 1 +; CHECK-NEXT: bgt cr0, .LBB13_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: cmplwi r5, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: .LBB13_2: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, 1 @@ -414,17 +426,19 @@ return: ; preds = %if.end, %entry } define i64 @i_a_op_b_2(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a_op_b_2: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: slw 6, 3, 4 -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmplwi 6, 2 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: cmpwi 6, 2 -; PPC64LE-NEXT: mullw 3, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 3, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a_op_b_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slw r5, r3, r4 +; CHECK-NEXT: cmpwi r5, 2 +; CHECK-NEXT: bgt cr0, .LBB14_2 +; CHECK-NEXT: # %bb.1: # 
%if.end +; CHECK-NEXT: cmplwi r5, 2 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: .LBB14_2: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %shl = shl i32 %a, %b %cmp = icmp sgt i32 %shl, 2 @@ -443,15 +457,17 @@ return: ; preds = %if.end, %entry } define i64 @i_a__2(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a__2: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmpwi 3, -2 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: mullw 3, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 3, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a__2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r3, -2 +; CHECK-NEXT: bgt cr0, .LBB15_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: .LBB15_2: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, -2 br i1 %cmp, label %return, label %if.end @@ -469,15 +485,19 @@ return: ; preds = %if.end, %entry } define i64 @i_a__1(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a__1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmpwi 3, -1 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: mullw 3, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 3, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a__1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r3, -1 +; CHECK-NEXT: ble cr0, .LBB16_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB16_2: # %if.end +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, -1 br i1 %cmp, label %return, label %if.end @@ -495,16 +515,20 @@ return: ; preds = %if.end, %entry } define i64 @i_a_0(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a_0: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmplwi 3, 0 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: cmpwi 0, 3, 0 -; PPC64LE-NEXT: mullw 5, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 5, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r3, 0 +; CHECK-NEXT: ble cr0, .LBB17_2 +; CHECK-NEXT: # %bb.1: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB17_2: # %if.end +; CHECK-NEXT: cmplwi r3, 0 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, 0 br i1 %cmp, label %return, label %if.end @@ -522,16 +546,18 @@ return: ; preds = %if.end, %entry } define i64 @i_a_1(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a_1: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmplwi 3, 1 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: cmpwi 0, 3, 1 -; PPC64LE-NEXT: mullw 5, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 5, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r3, 1 +; CHECK-NEXT: bgt cr0, .LBB18_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: cmplwi r3, 1 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: .LBB18_2: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, 1 br i1 %cmp, label %return, label %if.end @@ -549,16 +575,18 @@ return: ; preds 
= %if.end, %entry } define i64 @i_a_2(i32 signext %a, i32 signext %b) { -; PPC64LE-LABEL: i_a_2: -; PPC64LE: # %bb.0: # %entry -; PPC64LE-NEXT: li 5, 1 -; PPC64LE-NEXT: cmplwi 3, 2 -; PPC64LE-NEXT: isel 5, 4, 5, 2 -; PPC64LE-NEXT: cmpwi 0, 3, 2 -; PPC64LE-NEXT: mullw 5, 5, 3 -; PPC64LE-NEXT: isel 3, 4, 5, 1 -; PPC64LE-NEXT: extsw 3, 3 -; PPC64LE-NEXT: blr +; CHECK-LABEL: i_a_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r3, 2 +; CHECK-NEXT: bgt cr0, .LBB19_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: cmplwi r3, 2 +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: isel r4, r4, r5, eq +; CHECK-NEXT: mullw r4, r4, r3 +; CHECK-NEXT: .LBB19_2: # %return +; CHECK-NEXT: extsw r3, r4 +; CHECK-NEXT: blr entry: %cmp = icmp sgt i32 %a, 2 br i1 %cmp, label %return, label %if.end diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll index ac939583f8ff14..0b4defcd88a4e3 100644 --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -2,7 +2,7 @@ ; RUN: llc -O3 -mtriple=powerpc64le-linux-gnu < %s | FileCheck --check-prefix=PC64LE %s ; RUN: llc -O3 -mtriple=powerpc64le-linux-gnu -mcpu=pwr9 < %s | FileCheck --check-prefix=PC64LE9 %s -define <1 x float> @constrained_vector_fdiv_v1f32() nounwind { +define <1 x float> @constrained_vector_fdiv_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_fdiv_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI0_0@toc@ha @@ -29,11 +29,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %div } -define <2 x double> @constrained_vector_fdiv_v2f64() nounwind { +define <2 x double> @constrained_vector_fdiv_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_fdiv_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI1_0@toc@ha @@ -62,11 +62,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %div } -define <3 x float> @constrained_vector_fdiv_v3f32() nounwind { +define <3 x float> @constrained_vector_fdiv_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_fdiv_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI2_0@toc@ha @@ -123,11 +123,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %div } -define <3 x double> @constrained_vector_fdiv_v3f64() nounwind { +define <3 x double> @constrained_vector_fdiv_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_fdiv_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI3_2@toc@ha @@ -172,11 +172,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %div } -define <4 x double> @constrained_vector_fdiv_v4f64() nounwind { +define <4 x double> @constrained_vector_fdiv_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_fdiv_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI4_0@toc@ha @@ -216,11 +216,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %div } -define <1 x float> @constrained_vector_frem_v1f32() nounwind { +define <1 x float> @constrained_vector_frem_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_frem_v1f32: ; PC64LE: # %bb.0: # %entry 
; PC64LE-NEXT: mflr 0 @@ -261,11 +261,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %rem } -define <2 x double> @constrained_vector_frem_v2f64() nounwind { +define <2 x double> @constrained_vector_frem_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_frem_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -330,11 +330,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %rem } -define <3 x float> @constrained_vector_frem_v3f32() nounwind { +define <3 x float> @constrained_vector_frem_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_frem_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -431,11 +431,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %rem } -define <3 x double> @constrained_vector_frem_v3f64() nounwind { +define <3 x double> @constrained_vector_frem_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_frem_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -522,11 +522,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %rem } -define <4 x double> @constrained_vector_frem_v4f64() nounwind { +define <4 x double> @constrained_vector_frem_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_frem_v4f64: ; PC64LE: # %bb.0: ; PC64LE-NEXT: mflr 0 @@ -632,11 +632,11 @@ define <4 x double> @constrained_vector_frem_v4f64() nounwind { <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %rem } -define <1 x float> @constrained_vector_fmul_v1f32() nounwind { +define <1 x float> @constrained_vector_fmul_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_fmul_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI10_0@toc@ha @@ -663,11 +663,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %mul } -define <2 x double> @constrained_vector_fmul_v2f64() nounwind { +define <2 x double> @constrained_vector_fmul_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_fmul_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI11_0@toc@ha @@ -696,11 +696,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %mul } -define <3 x float> @constrained_vector_fmul_v3f32() nounwind { +define <3 x float> @constrained_vector_fmul_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_fmul_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI12_1@toc@ha @@ -758,11 +758,11 @@ entry: float 0x7FF0000000000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %mul } -define <3 x double> @constrained_vector_fmul_v3f64() nounwind { +define <3 x double> @constrained_vector_fmul_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_fmul_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI13_2@toc@ha @@ -808,11 +808,11 @@ entry: double 0x7FEFFFFFFFFFFFFF>, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %mul } -define <4 x double> @constrained_vector_fmul_v4f64() 
nounwind { +define <4 x double> @constrained_vector_fmul_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_fmul_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI14_0@toc@ha @@ -852,11 +852,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %mul } -define <1 x float> @constrained_vector_fadd_v1f32() nounwind { +define <1 x float> @constrained_vector_fadd_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_fadd_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI15_0@toc@ha @@ -883,11 +883,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %add } -define <2 x double> @constrained_vector_fadd_v2f64() nounwind { +define <2 x double> @constrained_vector_fadd_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_fadd_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI16_0@toc@ha @@ -916,11 +916,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %add } -define <3 x float> @constrained_vector_fadd_v3f32() nounwind { +define <3 x float> @constrained_vector_fadd_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_fadd_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI17_0@toc@ha @@ -976,11 +976,11 @@ entry: float 0xFFFFFFFFE0000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %add } -define <3 x double> @constrained_vector_fadd_v3f64() nounwind { +define <3 x double> @constrained_vector_fadd_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_fadd_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI18_1@toc@ha @@ -1024,11 +1024,11 @@ entry: double 0x7FEFFFFFFFFFFFFF>, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %add } -define <4 x double> @constrained_vector_fadd_v4f64() nounwind { +define <4 x double> @constrained_vector_fadd_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_fadd_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI19_0@toc@ha @@ -1068,11 +1068,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %add } -define <1 x float> @constrained_vector_fsub_v1f32() nounwind { +define <1 x float> @constrained_vector_fsub_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_fsub_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI20_0@toc@ha @@ -1099,11 +1099,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %sub } -define <2 x double> @constrained_vector_fsub_v2f64() nounwind { +define <2 x double> @constrained_vector_fsub_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_fsub_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI21_0@toc@ha @@ -1132,11 +1132,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %sub } -define <3 x float> @constrained_vector_fsub_v3f32() nounwind { +define <3 x float> @constrained_vector_fsub_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_fsub_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI22_0@toc@ha @@ -1192,11 +1192,11 @@ entry: 
float 0xFFFFFFFFE0000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %sub } -define <3 x double> @constrained_vector_fsub_v3f64() nounwind { +define <3 x double> @constrained_vector_fsub_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_fsub_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI23_1@toc@ha @@ -1240,11 +1240,11 @@ entry: double 0xFFEFFFFFFFFFFFFF>, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %sub } -define <4 x double> @constrained_vector_fsub_v4f64() nounwind { +define <4 x double> @constrained_vector_fsub_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_fsub_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI24_0@toc@ha @@ -1284,11 +1284,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %sub } -define <1 x float> @constrained_vector_sqrt_v1f32() nounwind { +define <1 x float> @constrained_vector_sqrt_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_sqrt_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI25_0@toc@ha @@ -1310,11 +1310,11 @@ entry: %sqrt = call <1 x float> @llvm.experimental.constrained.sqrt.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %sqrt } -define <2 x double> @constrained_vector_sqrt_v2f64() nounwind { +define <2 x double> @constrained_vector_sqrt_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_sqrt_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI26_0@toc@ha @@ -1335,11 +1335,11 @@ entry: %sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %sqrt } -define <3 x float> @constrained_vector_sqrt_v3f32() nounwind { +define <3 x float> @constrained_vector_sqrt_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_sqrt_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI27_2@toc@ha @@ -1391,11 +1391,11 @@ entry: %sqrt = call <3 x float> @llvm.experimental.constrained.sqrt.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %sqrt } -define <3 x double> @constrained_vector_sqrt_v3f64() nounwind { +define <3 x double> @constrained_vector_sqrt_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_sqrt_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI28_1@toc@ha @@ -1428,11 +1428,11 @@ entry: %sqrt = call <3 x double> @llvm.experimental.constrained.sqrt.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %sqrt } -define <4 x double> @constrained_vector_sqrt_v4f64() nounwind { +define <4 x double> @constrained_vector_sqrt_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_sqrt_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI29_0@toc@ha @@ -1463,11 +1463,11 @@ define <4 x double> @constrained_vector_sqrt_v4f64() nounwind { <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %sqrt } -define <1 x float> @constrained_vector_pow_v1f32() nounwind { +define <1 x float> @constrained_vector_pow_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_pow_v1f32: ; PC64LE: # %bb.0: # %entry ; 
PC64LE-NEXT: mflr 0 @@ -1508,11 +1508,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %pow } -define <2 x double> @constrained_vector_pow_v2f64() nounwind { +define <2 x double> @constrained_vector_pow_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_pow_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -1577,11 +1577,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %pow } -define <3 x float> @constrained_vector_pow_v3f32() nounwind { +define <3 x float> @constrained_vector_pow_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_pow_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -1678,11 +1678,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %pow } -define <3 x double> @constrained_vector_pow_v3f64() nounwind { +define <3 x double> @constrained_vector_pow_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_pow_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -1769,11 +1769,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %pow } -define <4 x double> @constrained_vector_pow_v4f64() nounwind { +define <4 x double> @constrained_vector_pow_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_pow_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -1880,11 +1880,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %pow } -define <1 x float> @constrained_vector_powi_v1f32() nounwind { +define <1 x float> @constrained_vector_powi_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_powi_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -1923,11 +1923,11 @@ entry: <1 x float> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %powi } -define <2 x double> @constrained_vector_powi_v2f64() nounwind { +define <2 x double> @constrained_vector_powi_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_powi_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -1984,11 +1984,11 @@ entry: <2 x double> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %powi } -define <3 x float> @constrained_vector_powi_v3f32() nounwind { +define <3 x float> @constrained_vector_powi_v3f32() #0 { ; ; ; PC64LE-LABEL: constrained_vector_powi_v3f32: @@ -2079,11 +2079,11 @@ entry: <3 x float> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %powi } -define <3 x double> @constrained_vector_powi_v3f64() nounwind { +define <3 x double> @constrained_vector_powi_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_powi_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2162,11 +2162,11 @@ entry: <3 x double> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %powi } -define <4 x double> @constrained_vector_powi_v4f64() nounwind { +define <4 x double> @constrained_vector_powi_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_powi_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2264,11 +2264,11 @@ entry: double 42.3, double 42.4>, i32 3, 
metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %powi } -define <1 x float> @constrained_vector_sin_v1f32() nounwind { +define <1 x float> @constrained_vector_sin_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_sin_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2304,11 +2304,11 @@ entry: %sin = call <1 x float> @llvm.experimental.constrained.sin.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %sin } -define <2 x double> @constrained_vector_sin_v2f64() nounwind { +define <2 x double> @constrained_vector_sin_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_sin_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2360,11 +2360,11 @@ entry: %sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %sin } -define <3 x float> @constrained_vector_sin_v3f32() nounwind { +define <3 x float> @constrained_vector_sin_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_sin_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2446,11 +2446,11 @@ entry: %sin = call <3 x float> @llvm.experimental.constrained.sin.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %sin } -define <3 x double> @constrained_vector_sin_v3f64() nounwind { +define <3 x double> @constrained_vector_sin_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_sin_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2522,11 +2522,11 @@ entry: %sin = call <3 x double> @llvm.experimental.constrained.sin.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %sin } -define <4 x double> @constrained_vector_sin_v4f64() nounwind { +define <4 x double> @constrained_vector_sin_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_sin_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2615,11 +2615,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %sin } -define <1 x float> @constrained_vector_cos_v1f32() nounwind { +define <1 x float> @constrained_vector_cos_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_cos_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2655,11 +2655,11 @@ entry: %cos = call <1 x float> @llvm.experimental.constrained.cos.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %cos } -define <2 x double> @constrained_vector_cos_v2f64() nounwind { +define <2 x double> @constrained_vector_cos_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_cos_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2711,11 +2711,11 @@ entry: %cos = call <2 x double> @llvm.experimental.constrained.cos.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %cos } -define <3 x float> @constrained_vector_cos_v3f32() nounwind { +define <3 x float> @constrained_vector_cos_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_cos_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2797,11 +2797,11 @@ entry: %cos = call <3 x float> @llvm.experimental.constrained.cos.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + 
metadata !"fpexcept.strict") #1 ret <3 x float> %cos } -define <3 x double> @constrained_vector_cos_v3f64() nounwind { +define <3 x double> @constrained_vector_cos_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_cos_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2873,11 +2873,11 @@ entry: %cos = call <3 x double> @llvm.experimental.constrained.cos.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %cos } -define <4 x double> @constrained_vector_cos_v4f64() nounwind { +define <4 x double> @constrained_vector_cos_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_cos_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -2966,11 +2966,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %cos } -define <1 x float> @constrained_vector_exp_v1f32() nounwind { +define <1 x float> @constrained_vector_exp_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_exp_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3006,11 +3006,11 @@ entry: %exp = call <1 x float> @llvm.experimental.constrained.exp.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %exp } -define <2 x double> @constrained_vector_exp_v2f64() nounwind { +define <2 x double> @constrained_vector_exp_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_exp_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3062,11 +3062,11 @@ entry: %exp = call <2 x double> @llvm.experimental.constrained.exp.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %exp } -define <3 x float> @constrained_vector_exp_v3f32() nounwind { +define <3 x float> @constrained_vector_exp_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_exp_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3148,11 +3148,11 @@ entry: %exp = call <3 x float> @llvm.experimental.constrained.exp.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %exp } -define <3 x double> @constrained_vector_exp_v3f64() nounwind { +define <3 x double> @constrained_vector_exp_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_exp_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3224,11 +3224,11 @@ entry: %exp = call <3 x double> @llvm.experimental.constrained.exp.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %exp } -define <4 x double> @constrained_vector_exp_v4f64() nounwind { +define <4 x double> @constrained_vector_exp_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_exp_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3317,11 +3317,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %exp } -define <1 x float> @constrained_vector_exp2_v1f32() nounwind { +define <1 x float> @constrained_vector_exp2_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_exp2_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3357,11 +3357,11 @@ entry: %exp2 = call <1 x float> @llvm.experimental.constrained.exp2.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %exp2 } -define <2 x double> @constrained_vector_exp2_v2f64() nounwind { 
+define <2 x double> @constrained_vector_exp2_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_exp2_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3413,11 +3413,11 @@ entry: %exp2 = call <2 x double> @llvm.experimental.constrained.exp2.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %exp2 } -define <3 x float> @constrained_vector_exp2_v3f32() nounwind { +define <3 x float> @constrained_vector_exp2_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_exp2_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3499,11 +3499,11 @@ entry: %exp2 = call <3 x float> @llvm.experimental.constrained.exp2.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %exp2 } -define <3 x double> @constrained_vector_exp2_v3f64() nounwind { +define <3 x double> @constrained_vector_exp2_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_exp2_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3575,11 +3575,11 @@ entry: %exp2 = call <3 x double> @llvm.experimental.constrained.exp2.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %exp2 } -define <4 x double> @constrained_vector_exp2_v4f64() nounwind { +define <4 x double> @constrained_vector_exp2_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_exp2_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3668,11 +3668,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %exp2 } -define <1 x float> @constrained_vector_log_v1f32() nounwind { +define <1 x float> @constrained_vector_log_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_log_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3708,11 +3708,11 @@ entry: %log = call <1 x float> @llvm.experimental.constrained.log.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %log } -define <2 x double> @constrained_vector_log_v2f64() nounwind { +define <2 x double> @constrained_vector_log_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_log_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3764,11 +3764,11 @@ entry: %log = call <2 x double> @llvm.experimental.constrained.log.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %log } -define <3 x float> @constrained_vector_log_v3f32() nounwind { +define <3 x float> @constrained_vector_log_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_log_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3850,11 +3850,11 @@ entry: %log = call <3 x float> @llvm.experimental.constrained.log.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %log } -define <3 x double> @constrained_vector_log_v3f64() nounwind { +define <3 x double> @constrained_vector_log_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_log_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -3926,11 +3926,11 @@ entry: %log = call <3 x double> @llvm.experimental.constrained.log.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %log } -define <4 x double> @constrained_vector_log_v4f64() nounwind { +define <4 x double> 
@constrained_vector_log_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_log_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4019,11 +4019,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %log } -define <1 x float> @constrained_vector_log10_v1f32() nounwind { +define <1 x float> @constrained_vector_log10_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_log10_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4059,11 +4059,11 @@ entry: %log10 = call <1 x float> @llvm.experimental.constrained.log10.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %log10 } -define <2 x double> @constrained_vector_log10_v2f64() nounwind { +define <2 x double> @constrained_vector_log10_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_log10_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4115,11 +4115,11 @@ entry: %log10 = call <2 x double> @llvm.experimental.constrained.log10.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %log10 } -define <3 x float> @constrained_vector_log10_v3f32() nounwind { +define <3 x float> @constrained_vector_log10_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_log10_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4201,11 +4201,11 @@ entry: %log10 = call <3 x float> @llvm.experimental.constrained.log10.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %log10 } -define <3 x double> @constrained_vector_log10_v3f64() nounwind { +define <3 x double> @constrained_vector_log10_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_log10_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4277,11 +4277,11 @@ entry: %log10 = call <3 x double> @llvm.experimental.constrained.log10.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %log10 } -define <4 x double> @constrained_vector_log10_v4f64() nounwind { +define <4 x double> @constrained_vector_log10_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_log10_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4370,11 +4370,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %log10 } -define <1 x float> @constrained_vector_log2_v1f32() nounwind { +define <1 x float> @constrained_vector_log2_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_log2_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4410,11 +4410,11 @@ entry: %log2 = call <1 x float> @llvm.experimental.constrained.log2.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %log2 } -define <2 x double> @constrained_vector_log2_v2f64() nounwind { +define <2 x double> @constrained_vector_log2_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_log2_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4466,11 +4466,11 @@ entry: %log2 = call <2 x double> @llvm.experimental.constrained.log2.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %log2 } -define <3 x float> @constrained_vector_log2_v3f32() nounwind { +define <3 x float> @constrained_vector_log2_v3f32() #0 { ; PC64LE-LABEL: 
constrained_vector_log2_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4552,11 +4552,11 @@ entry: %log2 = call <3 x float> @llvm.experimental.constrained.log2.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %log2 } -define <3 x double> @constrained_vector_log2_v3f64() nounwind { +define <3 x double> @constrained_vector_log2_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_log2_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4628,11 +4628,11 @@ entry: %log2 = call <3 x double> @llvm.experimental.constrained.log2.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %log2 } -define <4 x double> @constrained_vector_log2_v4f64() nounwind { +define <4 x double> @constrained_vector_log2_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_log2_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4721,11 +4721,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %log2 } -define <1 x float> @constrained_vector_rint_v1f32() nounwind { +define <1 x float> @constrained_vector_rint_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_rint_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4761,11 +4761,11 @@ entry: %rint = call <1 x float> @llvm.experimental.constrained.rint.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %rint } -define <2 x double> @constrained_vector_rint_v2f64() nounwind { +define <2 x double> @constrained_vector_rint_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_rint_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4817,11 +4817,11 @@ entry: %rint = call <2 x double> @llvm.experimental.constrained.rint.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %rint } -define <3 x float> @constrained_vector_rint_v3f32() nounwind { +define <3 x float> @constrained_vector_rint_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_rint_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4903,11 +4903,11 @@ define <3 x float> @constrained_vector_rint_v3f32() nounwind { %rint = call <3 x float> @llvm.experimental.constrained.rint.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %rint } -define <3 x double> @constrained_vector_rint_v3f64() nounwind { +define <3 x double> @constrained_vector_rint_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_rint_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -4979,11 +4979,11 @@ entry: %rint = call <3 x double> @llvm.experimental.constrained.rint.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %rint } -define <4 x double> @constrained_vector_rint_v4f64() nounwind { +define <4 x double> @constrained_vector_rint_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_rint_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5072,11 +5072,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %rint } -define <1 x float> @constrained_vector_nearbyint_v1f32() nounwind { +define <1 x float> @constrained_vector_nearbyint_v1f32() #0 { ; PC64LE-LABEL: 
constrained_vector_nearbyint_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5112,11 +5112,11 @@ entry: %nearby = call <1 x float> @llvm.experimental.constrained.nearbyint.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %nearby } -define <2 x double> @constrained_vector_nearbyint_v2f64() nounwind { +define <2 x double> @constrained_vector_nearbyint_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_nearbyint_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5168,11 +5168,11 @@ entry: %nearby = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %nearby } -define <3 x float> @constrained_vector_nearbyint_v3f32() nounwind { +define <3 x float> @constrained_vector_nearbyint_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_nearbyint_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5254,11 +5254,11 @@ entry: %nearby = call <3 x float> @llvm.experimental.constrained.nearbyint.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %nearby } -define <3 x double> @constrained_vector_nearby_v3f64() nounwind { +define <3 x double> @constrained_vector_nearby_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_nearby_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5330,11 +5330,11 @@ entry: %nearby = call <3 x double> @llvm.experimental.constrained.nearbyint.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %nearby } -define <4 x double> @constrained_vector_nearbyint_v4f64() nounwind { +define <4 x double> @constrained_vector_nearbyint_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_nearbyint_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5423,11 +5423,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %nearby } -define <1 x float> @constrained_vector_maxnum_v1f32() nounwind { +define <1 x float> @constrained_vector_maxnum_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_maxnum_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5467,11 +5467,11 @@ entry: %max = call <1 x float> @llvm.experimental.constrained.maxnum.v1f32( <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %max } -define <2 x double> @constrained_vector_maxnum_v2f64() nounwind { +define <2 x double> @constrained_vector_maxnum_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_maxnum_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5532,11 +5532,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %max } -define <3 x float> @constrained_vector_maxnum_v3f32() nounwind { +define <3 x float> @constrained_vector_maxnum_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_maxnum_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5635,11 +5635,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %max } -define <3 x double> @constrained_vector_max_v3f64() nounwind { +define <3 x double> @constrained_vector_max_v3f64() #0 { ; PC64LE-LABEL: 
constrained_vector_max_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5724,11 +5724,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %max } -define <4 x double> @constrained_vector_maxnum_v4f64() nounwind { +define <4 x double> @constrained_vector_maxnum_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_maxnum_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5835,11 +5835,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %max } -define <1 x float> @constrained_vector_minnum_v1f32() nounwind { +define <1 x float> @constrained_vector_minnum_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_minnum_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5879,11 +5879,11 @@ define <1 x float> @constrained_vector_minnum_v1f32() nounwind { %min = call <1 x float> @llvm.experimental.constrained.minnum.v1f32( <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %min } -define <2 x double> @constrained_vector_minnum_v2f64() nounwind { +define <2 x double> @constrained_vector_minnum_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_minnum_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -5944,11 +5944,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %min } -define <3 x float> @constrained_vector_minnum_v3f32() nounwind { +define <3 x float> @constrained_vector_minnum_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_minnum_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -6047,11 +6047,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %min } -define <3 x double> @constrained_vector_min_v3f64() nounwind { +define <3 x double> @constrained_vector_min_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_min_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -6136,11 +6136,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %min } -define <4 x double> @constrained_vector_minnum_v4f64() nounwind { +define <4 x double> @constrained_vector_minnum_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_minnum_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: mflr 0 @@ -6247,11 +6247,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %min } -define <1 x float> @constrained_vector_fptrunc_v1f64() nounwind { +define <1 x float> @constrained_vector_fptrunc_v1f64() #0 { ; PC64LE-LABEL: constrained_vector_fptrunc_v1f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI95_0@toc@ha @@ -6273,11 +6273,11 @@ entry: %result = call <1 x float> @llvm.experimental.constrained.fptrunc.v1f32.v1f64( <1 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %result } -define <2 x float> @constrained_vector_fptrunc_v2f64() nounwind { +define <2 x float> @constrained_vector_fptrunc_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_fptrunc_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI96_0@toc@ha @@ -6311,11 +6311,11 @@ entry: %result = call <2 x 
float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64( <2 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x float> %result } -define <3 x float> @constrained_vector_fptrunc_v3f64() nounwind { +define <3 x float> @constrained_vector_fptrunc_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_fptrunc_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI97_0@toc@ha @@ -6368,11 +6368,11 @@ entry: <3 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %result } -define <4 x float> @constrained_vector_fptrunc_v4f64() nounwind { +define <4 x float> @constrained_vector_fptrunc_v4f64() #0 { ; PC64LE-LABEL: constrained_vector_fptrunc_v4f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI98_0@toc@ha @@ -6411,11 +6411,11 @@ entry: <4 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x float> %result } -define <1 x double> @constrained_vector_fpext_v1f32() nounwind { +define <1 x double> @constrained_vector_fpext_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_fpext_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI99_0@toc@ha @@ -6432,11 +6432,11 @@ define <1 x double> @constrained_vector_fpext_v1f32() nounwind { entry: %result = call <1 x double> @llvm.experimental.constrained.fpext.v1f64.v1f32( <1 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x double> %result } -define <2 x double> @constrained_vector_fpext_v2f32() nounwind { +define <2 x double> @constrained_vector_fpext_v2f32() #0 { ; PC64LE-LABEL: constrained_vector_fpext_v2f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI100_0@toc@ha @@ -6457,11 +6457,11 @@ define <2 x double> @constrained_vector_fpext_v2f32() nounwind { entry: %result = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32( <2 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %result } -define <3 x double> @constrained_vector_fpext_v3f32() nounwind { +define <3 x double> @constrained_vector_fpext_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_fpext_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI101_0@toc@ha @@ -6485,11 +6485,11 @@ entry: %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32( <3 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %result } -define <4 x double> @constrained_vector_fpext_v4f32() nounwind { +define <4 x double> @constrained_vector_fpext_v4f32() #0 { ; PC64LE-LABEL: constrained_vector_fpext_v4f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI102_0@toc@ha @@ -6521,11 +6521,11 @@ entry: %result = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32( <4 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <4 x double> %result } -define <1 x float> @constrained_vector_ceil_v1f32() nounwind { +define <1 x float> @constrained_vector_ceil_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_ceil_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI103_0@toc@ha @@ -6547,11 +6547,11 @@ entry: %ceil = call <1 x float> @llvm.experimental.constrained.ceil.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %ceil } -define <2 x double> @constrained_vector_ceil_v2f64() nounwind { +define <2 x double> 
@constrained_vector_ceil_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_ceil_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI104_0@toc@ha @@ -6572,11 +6572,11 @@ entry: %ceil = call <2 x double> @llvm.experimental.constrained.ceil.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %ceil } -define <3 x float> @constrained_vector_ceil_v3f32() nounwind { +define <3 x float> @constrained_vector_ceil_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_ceil_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI105_2@toc@ha @@ -6628,11 +6628,11 @@ entry: %ceil = call <3 x float> @llvm.experimental.constrained.ceil.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %ceil } -define <3 x double> @constrained_vector_ceil_v3f64() nounwind { +define <3 x double> @constrained_vector_ceil_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_ceil_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI106_1@toc@ha @@ -6665,11 +6665,11 @@ entry: %ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %ceil } -define <1 x float> @constrained_vector_floor_v1f32() nounwind { +define <1 x float> @constrained_vector_floor_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_floor_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI107_0@toc@ha @@ -6691,12 +6691,12 @@ entry: %floor = call <1 x float> @llvm.experimental.constrained.floor.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %floor } -define <2 x double> @constrained_vector_floor_v2f64() nounwind { +define <2 x double> @constrained_vector_floor_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_floor_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI108_0@toc@ha @@ -6717,11 +6717,11 @@ entry: %floor = call <2 x double> @llvm.experimental.constrained.floor.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %floor } -define <3 x float> @constrained_vector_floor_v3f32() nounwind { +define <3 x float> @constrained_vector_floor_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_floor_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI109_2@toc@ha @@ -6773,11 +6773,11 @@ entry: %floor = call <3 x float> @llvm.experimental.constrained.floor.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %floor } -define <3 x double> @constrained_vector_floor_v3f64() nounwind { +define <3 x double> @constrained_vector_floor_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_floor_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI110_1@toc@ha @@ -6810,11 +6810,11 @@ entry: %floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %floor } -define <1 x float> @constrained_vector_round_v1f32() nounwind { +define <1 x float> @constrained_vector_round_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_round_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI111_0@toc@ha @@ -6836,11 +6836,11 @@ entry: %round = call <1 x 
float> @llvm.experimental.constrained.round.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %round } -define <2 x double> @constrained_vector_round_v2f64() nounwind { +define <2 x double> @constrained_vector_round_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_round_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI112_0@toc@ha @@ -6861,11 +6861,11 @@ entry: %round = call <2 x double> @llvm.experimental.constrained.round.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %round } -define <3 x float> @constrained_vector_round_v3f32() nounwind { +define <3 x float> @constrained_vector_round_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_round_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI113_2@toc@ha @@ -6917,12 +6917,12 @@ entry: %round = call <3 x float> @llvm.experimental.constrained.round.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %round } -define <3 x double> @constrained_vector_round_v3f64() nounwind { +define <3 x double> @constrained_vector_round_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_round_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI114_1@toc@ha @@ -6955,11 +6955,11 @@ entry: %round = call <3 x double> @llvm.experimental.constrained.round.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %round } -define <1 x float> @constrained_vector_trunc_v1f32() nounwind { +define <1 x float> @constrained_vector_trunc_v1f32() #0 { ; PC64LE-LABEL: constrained_vector_trunc_v1f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI115_0@toc@ha @@ -6981,11 +6981,11 @@ entry: %trunc = call <1 x float> @llvm.experimental.constrained.trunc.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <1 x float> %trunc } -define <2 x double> @constrained_vector_trunc_v2f64() nounwind { +define <2 x double> @constrained_vector_trunc_v2f64() #0 { ; PC64LE-LABEL: constrained_vector_trunc_v2f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI116_0@toc@ha @@ -7006,11 +7006,11 @@ entry: %trunc = call <2 x double> @llvm.experimental.constrained.trunc.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <2 x double> %trunc } -define <3 x float> @constrained_vector_trunc_v3f32() nounwind { +define <3 x float> @constrained_vector_trunc_v3f32() #0 { ; PC64LE-LABEL: constrained_vector_trunc_v3f32: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI117_2@toc@ha @@ -7062,11 +7062,11 @@ entry: %trunc = call <3 x float> @llvm.experimental.constrained.trunc.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x float> %trunc } -define <3 x double> @constrained_vector_trunc_v3f64() nounwind { +define <3 x double> @constrained_vector_trunc_v3f64() #0 { ; PC64LE-LABEL: constrained_vector_trunc_v3f64: ; PC64LE: # %bb.0: # %entry ; PC64LE-NEXT: addis 3, 2, .LCPI118_1@toc@ha @@ -7099,10 +7099,12 @@ entry: %trunc = call <3 x double> @llvm.experimental.constrained.trunc.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #1 ret <3 x double> %trunc } +attributes 
#0 = { nounwind strictfp noimplicitfloat } +attributes #1 = { strictfp } ; Single width declarations declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/large-stack.ll b/llvm/test/CodeGen/RISCV/large-stack.ll index f4149712231847..48e09eb66dad03 100644 --- a/llvm/test/CodeGen/RISCV/large-stack.ll +++ b/llvm/test/CodeGen/RISCV/large-stack.ll @@ -6,44 +6,44 @@ ; TODO: the quality of the generated code is poor -define void @test() nounwind { +define void @test() { ; RV32I-FPELIM-LABEL: test: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: lui a0, 74565 ; RV32I-FPELIM-NEXT: addi a0, a0, 1664 ; RV32I-FPELIM-NEXT: sub sp, sp, a0 +; RV32I-FPELIM-NEXT: .cfi_def_cfa_offset 305419904 ; RV32I-FPELIM-NEXT: lui a0, 74565 ; RV32I-FPELIM-NEXT: addi a0, a0, 1664 ; RV32I-FPELIM-NEXT: add sp, sp, a0 +; RV32I-FPELIM-NEXT: .cfi_def_cfa_offset 0 ; RV32I-FPELIM-NEXT: ret ; ; RV32I-WITHFP-LABEL: test: ; RV32I-WITHFP: # %bb.0: +; RV32I-WITHFP-NEXT: addi sp, sp, -2032 +; RV32I-WITHFP-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-WITHFP-NEXT: sw ra, 2028(sp) +; RV32I-WITHFP-NEXT: sw s0, 2024(sp) +; RV32I-WITHFP-NEXT: .cfi_offset ra, -4 +; RV32I-WITHFP-NEXT: .cfi_offset s0, -8 +; RV32I-WITHFP-NEXT: addi s0, sp, 2032 +; RV32I-WITHFP-NEXT: .cfi_def_cfa s0, 0 ; RV32I-WITHFP-NEXT: lui a0, 74565 -; RV32I-WITHFP-NEXT: addi a0, a0, 1680 +; RV32I-WITHFP-NEXT: addi a0, a0, -352 ; RV32I-WITHFP-NEXT: sub sp, sp, a0 +; RV32I-WITHFP-NEXT: .cfi_def_cfa_offset 305419920 ; RV32I-WITHFP-NEXT: lui a0, 74565 -; RV32I-WITHFP-NEXT: addi a0, a0, 1676 -; RV32I-WITHFP-NEXT: add a0, sp, a0 -; RV32I-WITHFP-NEXT: sw ra, 0(a0) -; RV32I-WITHFP-NEXT: lui a0, 74565 -; RV32I-WITHFP-NEXT: addi a0, a0, 1672 -; RV32I-WITHFP-NEXT: add a0, sp, a0 -; RV32I-WITHFP-NEXT: sw s0, 0(a0) -; RV32I-WITHFP-NEXT: lui a0, 74565 -; RV32I-WITHFP-NEXT: addi a0, a0, 1680 -; RV32I-WITHFP-NEXT: add s0, sp, a0 -; RV32I-WITHFP-NEXT: lui a0, 74565 -; RV32I-WITHFP-NEXT: addi a0, a0, 1672 -; RV32I-WITHFP-NEXT: add a0, sp, a0 -; RV32I-WITHFP-NEXT: lw s0, 0(a0) -; RV32I-WITHFP-NEXT: lui a0, 74565 -; RV32I-WITHFP-NEXT: addi a0, a0, 1676 -; RV32I-WITHFP-NEXT: add a0, sp, a0 -; RV32I-WITHFP-NEXT: lw ra, 0(a0) -; RV32I-WITHFP-NEXT: lui a0, 74565 -; RV32I-WITHFP-NEXT: addi a0, a0, 1680 +; RV32I-WITHFP-NEXT: addi a0, a0, -352 ; RV32I-WITHFP-NEXT: add sp, sp, a0 +; RV32I-WITHFP-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-WITHFP-NEXT: lw s0, 2024(sp) +; RV32I-WITHFP-NEXT: .cfi_def_cfa sp, 305419920 +; RV32I-WITHFP-NEXT: lw ra, 2028(sp) +; RV32I-WITHFP-NEXT: .cfi_restore ra +; RV32I-WITHFP-NEXT: .cfi_restore s0 +; RV32I-WITHFP-NEXT: addi sp, sp, 2032 +; RV32I-WITHFP-NEXT: .cfi_def_cfa_offset 0 ; RV32I-WITHFP-NEXT: ret %tmp = alloca [ 305419896 x i8 ] , align 4 ret void @@ -52,20 +52,19 @@ define void @test() nounwind { ; This test case artificially produces register pressure which should force ; use of the emergency spill slot. 
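A rough reading of the numbers in the updated checks (an editorial note, not part of the patch): RISC-V loads and stores take 12-bit signed immediates, so offsets must fit in [-2048, 2047]. The frame here holds 100000 x i32 = 400,000 bytes of locals plus saves; 2032 + (97 * 4096 + 672) = 400,016 bytes in the frame-pointer-eliminated variant and 2032 + (97 * 4096 + 688) = 400,032 with a frame pointer, matching the .cfi_def_cfa_offset values. Reaching slots that far from sp requires materializing the offset in a scratch register, and when the artificial register pressure below leaves the register scavenger no free register, it falls back to the reserved emergency spill slot, which sits at a small, encodable offset near sp.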
-define void @test_emergency_spill_slot(i32 %a) nounwind { +define void @test_emergency_spill_slot(i32 %a) { ; RV32I-FPELIM-LABEL: test_emergency_spill_slot: ; RV32I-FPELIM: # %bb.0: -; RV32I-FPELIM-NEXT: lui a1, 98 -; RV32I-FPELIM-NEXT: addi a1, a1, -1392 +; RV32I-FPELIM-NEXT: addi sp, sp, -2032 +; RV32I-FPELIM-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-FPELIM-NEXT: sw s0, 2028(sp) +; RV32I-FPELIM-NEXT: sw s1, 2024(sp) +; RV32I-FPELIM-NEXT: .cfi_offset s0, -4 +; RV32I-FPELIM-NEXT: .cfi_offset s1, -8 +; RV32I-FPELIM-NEXT: lui a1, 97 +; RV32I-FPELIM-NEXT: addi a1, a1, 672 ; RV32I-FPELIM-NEXT: sub sp, sp, a1 -; RV32I-FPELIM-NEXT: lui a1, 98 -; RV32I-FPELIM-NEXT: addi a1, a1, -1396 -; RV32I-FPELIM-NEXT: add a1, sp, a1 -; RV32I-FPELIM-NEXT: sw s0, 0(a1) -; RV32I-FPELIM-NEXT: lui a1, 98 -; RV32I-FPELIM-NEXT: addi a1, a1, -1400 -; RV32I-FPELIM-NEXT: add a1, sp, a1 -; RV32I-FPELIM-NEXT: sw s1, 0(a1) +; RV32I-FPELIM-NEXT: .cfi_def_cfa_offset 400016 ; RV32I-FPELIM-NEXT: lui a1, 78 ; RV32I-FPELIM-NEXT: addi a1, a1, 512 ; RV32I-FPELIM-NEXT: addi a2, sp, 8 @@ -77,43 +76,36 @@ define void @test_emergency_spill_slot(i32 %a) nounwind { ; RV32I-FPELIM-NEXT: #APP ; RV32I-FPELIM-NEXT: nop ; RV32I-FPELIM-NEXT: #NO_APP -; RV32I-FPELIM-NEXT: lui a0, 98 -; RV32I-FPELIM-NEXT: addi a0, a0, -1400 -; RV32I-FPELIM-NEXT: add a0, sp, a0 -; RV32I-FPELIM-NEXT: lw s1, 0(a0) -; RV32I-FPELIM-NEXT: lui a0, 98 -; RV32I-FPELIM-NEXT: addi a0, a0, -1396 -; RV32I-FPELIM-NEXT: add a0, sp, a0 -; RV32I-FPELIM-NEXT: lw s0, 0(a0) -; RV32I-FPELIM-NEXT: lui a0, 98 -; RV32I-FPELIM-NEXT: addi a0, a0, -1392 +; RV32I-FPELIM-NEXT: lui a0, 97 +; RV32I-FPELIM-NEXT: addi a0, a0, 672 ; RV32I-FPELIM-NEXT: add sp, sp, a0 +; RV32I-FPELIM-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-FPELIM-NEXT: lw s1, 2024(sp) +; RV32I-FPELIM-NEXT: lw s0, 2028(sp) +; RV32I-FPELIM-NEXT: .cfi_restore s0 +; RV32I-FPELIM-NEXT: .cfi_restore s1 +; RV32I-FPELIM-NEXT: addi sp, sp, 2032 +; RV32I-FPELIM-NEXT: .cfi_def_cfa_offset 0 ; RV32I-FPELIM-NEXT: ret ; ; RV32I-WITHFP-LABEL: test_emergency_spill_slot: ; RV32I-WITHFP: # %bb.0: -; RV32I-WITHFP-NEXT: lui a1, 98 -; RV32I-WITHFP-NEXT: addi a1, a1, -1376 +; RV32I-WITHFP-NEXT: addi sp, sp, -2032 +; RV32I-WITHFP-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-WITHFP-NEXT: sw ra, 2028(sp) +; RV32I-WITHFP-NEXT: sw s0, 2024(sp) +; RV32I-WITHFP-NEXT: sw s1, 2020(sp) +; RV32I-WITHFP-NEXT: sw s2, 2016(sp) +; RV32I-WITHFP-NEXT: .cfi_offset ra, -4 +; RV32I-WITHFP-NEXT: .cfi_offset s0, -8 +; RV32I-WITHFP-NEXT: .cfi_offset s1, -12 +; RV32I-WITHFP-NEXT: .cfi_offset s2, -16 +; RV32I-WITHFP-NEXT: addi s0, sp, 2032 +; RV32I-WITHFP-NEXT: .cfi_def_cfa s0, 0 +; RV32I-WITHFP-NEXT: lui a1, 97 +; RV32I-WITHFP-NEXT: addi a1, a1, 688 ; RV32I-WITHFP-NEXT: sub sp, sp, a1 -; RV32I-WITHFP-NEXT: lui a1, 98 -; RV32I-WITHFP-NEXT: addi a1, a1, -1380 -; RV32I-WITHFP-NEXT: add a1, sp, a1 -; RV32I-WITHFP-NEXT: sw ra, 0(a1) -; RV32I-WITHFP-NEXT: lui a1, 98 -; RV32I-WITHFP-NEXT: addi a1, a1, -1384 -; RV32I-WITHFP-NEXT: add a1, sp, a1 -; RV32I-WITHFP-NEXT: sw s0, 0(a1) -; RV32I-WITHFP-NEXT: lui a1, 98 -; RV32I-WITHFP-NEXT: addi a1, a1, -1388 -; RV32I-WITHFP-NEXT: add a1, sp, a1 -; RV32I-WITHFP-NEXT: sw s1, 0(a1) -; RV32I-WITHFP-NEXT: lui a1, 98 -; RV32I-WITHFP-NEXT: addi a1, a1, -1392 -; RV32I-WITHFP-NEXT: add a1, sp, a1 -; RV32I-WITHFP-NEXT: sw s2, 0(a1) -; RV32I-WITHFP-NEXT: lui a1, 98 -; RV32I-WITHFP-NEXT: addi a1, a1, -1376 -; RV32I-WITHFP-NEXT: add s0, sp, a1 +; RV32I-WITHFP-NEXT: .cfi_def_cfa_offset 400032 ; RV32I-WITHFP-NEXT: lui a1, 78 ; RV32I-WITHFP-NEXT: addi a1, a1, 
512 ; RV32I-WITHFP-NEXT: lui a2, 1048478 @@ -128,25 +120,21 @@ define void @test_emergency_spill_slot(i32 %a) nounwind { ; RV32I-WITHFP-NEXT: #APP ; RV32I-WITHFP-NEXT: nop ; RV32I-WITHFP-NEXT: #NO_APP -; RV32I-WITHFP-NEXT: lui a0, 98 -; RV32I-WITHFP-NEXT: addi a0, a0, -1392 -; RV32I-WITHFP-NEXT: add a0, sp, a0 -; RV32I-WITHFP-NEXT: lw s2, 0(a0) -; RV32I-WITHFP-NEXT: lui a0, 98 -; RV32I-WITHFP-NEXT: addi a0, a0, -1388 -; RV32I-WITHFP-NEXT: add a0, sp, a0 -; RV32I-WITHFP-NEXT: lw s1, 0(a0) -; RV32I-WITHFP-NEXT: lui a0, 98 -; RV32I-WITHFP-NEXT: addi a0, a0, -1384 -; RV32I-WITHFP-NEXT: add a0, sp, a0 -; RV32I-WITHFP-NEXT: lw s0, 0(a0) -; RV32I-WITHFP-NEXT: lui a0, 98 -; RV32I-WITHFP-NEXT: addi a0, a0, -1380 -; RV32I-WITHFP-NEXT: add a0, sp, a0 -; RV32I-WITHFP-NEXT: lw ra, 0(a0) -; RV32I-WITHFP-NEXT: lui a0, 98 -; RV32I-WITHFP-NEXT: addi a0, a0, -1376 +; RV32I-WITHFP-NEXT: lui a0, 97 +; RV32I-WITHFP-NEXT: addi a0, a0, 688 ; RV32I-WITHFP-NEXT: add sp, sp, a0 +; RV32I-WITHFP-NEXT: .cfi_def_cfa_offset 2032 +; RV32I-WITHFP-NEXT: lw s2, 2016(sp) +; RV32I-WITHFP-NEXT: lw s1, 2020(sp) +; RV32I-WITHFP-NEXT: lw s0, 2024(sp) +; RV32I-WITHFP-NEXT: .cfi_def_cfa sp, 400032 +; RV32I-WITHFP-NEXT: lw ra, 2028(sp) +; RV32I-WITHFP-NEXT: .cfi_restore ra +; RV32I-WITHFP-NEXT: .cfi_restore s0 +; RV32I-WITHFP-NEXT: .cfi_restore s1 +; RV32I-WITHFP-NEXT: .cfi_restore s2 +; RV32I-WITHFP-NEXT: addi sp, sp, 2032 +; RV32I-WITHFP-NEXT: .cfi_def_cfa_offset 0 ; RV32I-WITHFP-NEXT: ret %data = alloca [ 100000 x i32 ] , align 4 %ptr = getelementptr inbounds [100000 x i32], [100000 x i32]* %data, i32 0, i32 80000 diff --git a/llvm/test/CodeGen/RISCV/rv64-large-stack.ll b/llvm/test/CodeGen/RISCV/rv64-large-stack.ll index bf862ac52aa84b..dbe19cc60e29fd 100644 --- a/llvm/test/CodeGen/RISCV/rv64-large-stack.ll +++ b/llvm/test/CodeGen/RISCV/rv64-large-stack.ll @@ -7,26 +7,22 @@ define void @foo() nounwind { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -2032 +; CHECK-NEXT: sd ra, 2024(sp) ; CHECK-NEXT: lui a0, 95 ; CHECK-NEXT: addiw a0, a0, 1505 ; CHECK-NEXT: slli a0, a0, 13 -; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: addi a0, a0, -2000 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: lui a0, 781250 -; CHECK-NEXT: addiw a0, a0, 24 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: sd ra, 0(a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: call baz -; CHECK-NEXT: lui a0, 781250 -; CHECK-NEXT: addiw a0, a0, 24 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: ld ra, 0(a0) ; CHECK-NEXT: lui a0, 95 ; CHECK-NEXT: addiw a0, a0, 1505 ; CHECK-NEXT: slli a0, a0, 13 -; CHECK-NEXT: addi a0, a0, 32 +; CHECK-NEXT: addi a0, a0, -2000 ; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 2024(sp) +; CHECK-NEXT: addi sp, sp, 2032 ; CHECK-NEXT: ret entry: %w = alloca [100000000 x { fp128, fp128 }], align 16 diff --git a/llvm/test/CodeGen/RISCV/split-sp-adjust.ll b/llvm/test/CodeGen/RISCV/split-sp-adjust.ll new file mode 100644 index 00000000000000..49f8e7010d6642 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/split-sp-adjust.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I + +; The stack size is 2048 and the SP adjustment will be split. 
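A short worked example of the split, reading the checks below against RISC-V's 12-bit signed immediates (this reasoning is inferred from the generated code, not stated in the patch): a frame of at most 2047 bytes can evidently be allocated with a single addi, and callee-save offsets must stay within [-2048, 2047]. SplitSP's 2028-byte %xx plus the ra save and alignment comes to the 2048-byte frame the comment mentions, which exceeds that limit, so the prologue first emits addi sp, sp, -2032 (keeping sw ra, 2028(sp) encodable) and then a second addi sp, sp, -16 for the remainder; NoSplitSP's 2032-byte frame fits, so the adjustment stays in one instruction.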
+define i32 @SplitSP() nounwind { +; RV32I-LABEL: SplitSP: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: sw ra, 2028(sp) +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: call foo +; RV32I-NEXT: mv a0, zero +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: lw ra, 2028(sp) +; RV32I-NEXT: addi sp, sp, 2032 +; RV32I-NEXT: ret +entry: + %xx = alloca [2028 x i8], align 1 + %0 = getelementptr inbounds [2028 x i8], [2028 x i8]* %xx, i32 0, i32 0 + %call = call i32 @foo(i8* nonnull %0) + ret i32 0 +} + +; The stack size is 2032 and the SP adjustment will not be split. +define i32 @NoSplitSP() nounwind { +; RV32I-LABEL: NoSplitSP: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: sw ra, 2028(sp) +; RV32I-NEXT: addi a0, sp, 4 +; RV32I-NEXT: call foo +; RV32I-NEXT: mv a0, zero +; RV32I-NEXT: lw ra, 2028(sp) +; RV32I-NEXT: addi sp, sp, 2032 +; RV32I-NEXT: ret +entry: + %xx = alloca [2024 x i8], align 1 + %0 = getelementptr inbounds [2024 x i8], [2024 x i8]* %xx, i32 0, i32 0 + %call = call i32 @foo(i8* nonnull %0) + ret i32 0 +} + +declare i32 @foo(i8*) diff --git a/llvm/test/CodeGen/RISCV/srem-lkk.ll b/llvm/test/CodeGen/RISCV/srem-lkk.ll new file mode 100644 index 00000000000000..5211e5291a26c7 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/srem-lkk.ll @@ -0,0 +1,583 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s + +define i32 @fold_srem_positive_odd(i32 %x) { +; RV32I-LABEL: fold_srem_positive_odd: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_positive_odd: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 706409 +; RV32IM-NEXT: addi a1, a1, 389 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: add a1, a1, a0 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 6 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_positive_odd: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_positive_odd: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: lui a1, 1045903 +; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1035 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -905 +; RV64IM-NEXT: slli a1, a1, 12 +; 
RV64IM-NEXT: addi a1, a1, -1767 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: add a1, a1, a0 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 6 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positive_even(i32 %x) { +; RV32I-LABEL: fold_srem_positive_even: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, 1060 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_positive_even: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 253241 +; RV32IM-NEXT: addi a1, a1, -15 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 8 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: addi a2, zero, 1060 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_positive_even: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: addi a1, zero, 1060 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_positive_even: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: lui a1, 506482 +; RV64IM-NEXT: addiw a1, a1, -31 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 711 +; RV64IM-NEXT: slli a1, a1, 19 +; RV64IM-NEXT: addi a1, a1, 1979 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 9 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, 1060 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; RV32I-LABEL: fold_srem_negative_odd: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, -723 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_negative_odd: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 677296 +; RV32IM-NEXT: addi a1, a1, -91 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 8 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: addi a2, zero, -723 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_negative_odd: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: addi a1, zero, -723 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: 
.cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_negative_odd: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: lui a1, 4781 +; RV64IM-NEXT: addiw a1, a1, 2045 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 1371 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -11 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -1355 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: sub a1, a1, a0 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 9 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, -723 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; RV32I-LABEL: fold_srem_negative_even: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: lui a1, 1048570 +; RV32I-NEXT: addi a1, a1, 1595 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_negative_even: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 1036895 +; RV32IM-NEXT: addi a1, a1, 999 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 8 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: lui a2, 1048570 +; RV32IM-NEXT: addi a2, a2, 1595 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_negative_even: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: lui a1, 1048570 +; RV64I-NEXT: addiw a1, a1, 1595 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_negative_even: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a0, a0 +; RV64IM-NEXT: lui a1, 1036895 +; RV64IM-NEXT: addiw a1, a1, 999 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, 11 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -523 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -481 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 12 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: lui a2, 1048570 +; RV64IM-NEXT: addiw a2, a2, 1595 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. 
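+; (When the same constant feeds both the srem and the sdiv, the quotient from
+; the multiply-high sequence is reused and the remainder is recovered as
+; x - quotient * 95, instead of emitting a second magic-number expansion.)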
+define i32 @combine_srem_sdiv(i32 %x) { +; RV32I-LABEL: combine_srem_sdiv: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: add a0, s1, a0 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: combine_srem_sdiv: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 706409 +; RV32IM-NEXT: addi a1, a1, 389 +; RV32IM-NEXT: mulh a1, a0, a1 +; RV32IM-NEXT: add a1, a1, a0 +; RV32IM-NEXT: srli a2, a1, 31 +; RV32IM-NEXT: srai a1, a1, 6 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a2, a1, a2 +; RV32IM-NEXT: sub a0, a0, a2 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: combine_srem_sdiv: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: .cfi_def_cfa_offset 32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: sext.w s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: addw a0, s1, a0 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: combine_srem_sdiv: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: lui a2, 1045903 +; RV64IM-NEXT: addiw a2, a2, -733 +; RV64IM-NEXT: slli a2, a2, 15 +; RV64IM-NEXT: addi a2, a2, 1035 +; RV64IM-NEXT: slli a2, a2, 12 +; RV64IM-NEXT: addi a2, a2, -905 +; RV64IM-NEXT: slli a2, a2, 12 +; RV64IM-NEXT: addi a2, a2, -1767 +; RV64IM-NEXT: mulh a2, a1, a2 +; RV64IM-NEXT: add a1, a2, a1 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 6 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a2, a1, a2 +; RV64IM-NEXT: sub a0, a0, a2 +; RV64IM-NEXT: addw a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. 
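+; (srem by a power of two only needs a sign adjustment, a mask and a subtract,
+; which is cheaper than the multiply-high expansion used for other constants.)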
+define i32 @dont_fold_srem_power_of_two(i32 %x) { +; RV32I-LABEL: dont_fold_srem_power_of_two: +; RV32I: # %bb.0: +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: srli a1, a1, 26 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: andi a1, a1, -64 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_power_of_two: +; RV32IM: # %bb.0: +; RV32IM-NEXT: srai a1, a0, 31 +; RV32IM-NEXT: srli a1, a1, 26 +; RV32IM-NEXT: add a1, a0, a1 +; RV32IM-NEXT: andi a1, a1, -64 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_power_of_two: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: srli a1, a1, 57 +; RV64I-NEXT: andi a1, a1, 63 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: addi a2, a2, -64 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_power_of_two: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: srli a1, a1, 57 +; RV64IM-NEXT: andi a1, a1, 63 +; RV64IM-NEXT: add a1, a0, a1 +; RV64IM-NEXT: addi a2, zero, 1 +; RV64IM-NEXT: slli a2, a2, 32 +; RV64IM-NEXT: addi a2, a2, -64 +; RV64IM-NEXT: and a1, a1, a2 +; RV64IM-NEXT: subw a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: mv a0, zero +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. +define i32 @dont_fold_srem_i32_smax(i32 %x) { +; RV32I-LABEL: dont_fold_srem_i32_smax: +; RV32I: # %bb.0: +; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: lui a2, 524288 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_i32_smax: +; RV32IM: # %bb.0: +; RV32IM-NEXT: srai a1, a0, 31 +; RV32IM-NEXT: srli a1, a1, 1 +; RV32IM-NEXT: add a1, a0, a1 +; RV32IM-NEXT: lui a2, 524288 +; RV32IM-NEXT: and a1, a1, a2 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_i32_smax: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a1, a0 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: lui a2, 524288 +; RV64I-NEXT: addiw a2, a2, -1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: add a1, a0, a1 +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: slli a2, a2, 31 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: addw a0, a0, a1 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_i32_smax: +; RV64IM: # %bb.0: +; RV64IM-NEXT: sext.w a1, a0 +; RV64IM-NEXT: srli a1, a1, 32 +; RV64IM-NEXT: lui a2, 524288 +; RV64IM-NEXT: addiw a2, a2, -1 +; RV64IM-NEXT: and a1, a1, a2 +; RV64IM-NEXT: add a1, a0, a1 +; RV64IM-NEXT: addi a2, zero, 1 +; RV64IM-NEXT: slli a2, a2, 31 +; RV64IM-NEXT: and a1, a1, a2 +; RV64IM-NEXT: addw a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; RV32I-LABEL: dont_fold_srem_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; 
RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a2, zero, 98 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_i64: +; RV32IM: # %bb.0: +; RV32IM-NEXT: addi sp, sp, -16 +; RV32IM-NEXT: .cfi_def_cfa_offset 16 +; RV32IM-NEXT: sw ra, 12(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: addi a2, zero, 98 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: lw ra, 12(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_i64: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lui a1, 2675 +; RV64IM-NEXT: addiw a1, a1, -251 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 1839 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 167 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, 1505 +; RV64IM-NEXT: mulh a1, a0, a1 +; RV64IM-NEXT: srli a2, a1, 63 +; RV64IM-NEXT: srai a1, a1, 5 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: addi a2, zero, 98 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem i64 %x, 98 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll new file mode 100644 index 00000000000000..ad7af93fce2f99 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -0,0 +1,1689 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; RV32I-LABEL: fold_srem_vec_1: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: lh s2, 12(a1) +; RV32I-NEXT: lh s3, 8(a1) +; RV32I-NEXT: lh s0, 4(a1) +; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, -124 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 98 +; RV32I-NEXT: 
mv a0, s3 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, -1003 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: sh s0, 4(s1) +; RV32I-NEXT: sh s5, 2(s1) +; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_vec_1: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a6, 12(a1) +; RV32IM-NEXT: lh a3, 8(a1) +; RV32IM-NEXT: lh a4, 0(a1) +; RV32IM-NEXT: lh a1, 4(a1) +; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a5, a4, a5 +; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: srli a2, a5, 31 +; RV32IM-NEXT: srli a5, a5, 6 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: lui a4, 507375 +; RV32IM-NEXT: addi a4, a4, 1981 +; RV32IM-NEXT: mulh a4, a1, a4 +; RV32IM-NEXT: sub a4, a4, a1 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 6 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, -124 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: lui a4, 342392 +; RV32IM-NEXT: addi a4, a4, 669 +; RV32IM-NEXT: mulh a4, a3, a4 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 5 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, 98 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: lui a4, 780943 +; RV32IM-NEXT: addi a4, a4, 1809 +; RV32IM-NEXT: mulh a4, a6, a4 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 8 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, -1003 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a4, a6, a4 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_vec_1: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: lh s2, 24(a1) +; RV64I-NEXT: lh s3, 16(a1) +; RV64I-NEXT: lh s0, 8(a1) +; RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, -124 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, -1003 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: sh s0, 4(s1) +; RV64I-NEXT: sh s5, 2(s1) 
+; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_vec_1: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a6, 24(a1) +; RV64IM-NEXT: lh a3, 16(a1) +; RV64IM-NEXT: lh a4, 8(a1) +; RV64IM-NEXT: lh a1, 0(a1) +; RV64IM-NEXT: lui a5, 1045903 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -905 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: srli a2, a5, 63 +; RV64IM-NEXT: srli a5, a5, 6 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a2 +; RV64IM-NEXT: lui a2, 248 +; RV64IM-NEXT: addiw a2, a2, -1057 +; RV64IM-NEXT: slli a2, a2, 15 +; RV64IM-NEXT: addi a2, a2, -1057 +; RV64IM-NEXT: slli a2, a2, 15 +; RV64IM-NEXT: addi a2, a2, -1057 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi a2, a2, -265 +; RV64IM-NEXT: mulh a2, a4, a2 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: srli a5, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: addi a5, zero, -124 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: lui a4, 2675 +; RV64IM-NEXT: addiw a4, a4, -251 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1839 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 167 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1505 +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 5 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 98 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 1040212 +; RV64IM-NEXT: addiw a4, a4, 1977 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1907 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -453 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1213 +; RV64IM-NEXT: mulh a4, a6, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, -1003 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a4, a6, a4 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; RV32I-LABEL: fold_srem_vec_2: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset 
s5, -28 +; RV32I-NEXT: lh s2, 12(a1) +; RV32I-NEXT: lh s3, 8(a1) +; RV32I-NEXT: lh s0, 4(a1) +; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: sh s0, 4(s1) +; RV32I-NEXT: sh s5, 2(s1) +; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_srem_vec_2: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a6, 12(a1) +; RV32IM-NEXT: lh a3, 8(a1) +; RV32IM-NEXT: lh a4, 0(a1) +; RV32IM-NEXT: lh a1, 4(a1) +; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a2, a4, a5 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a7, a2, 31 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: add a2, a2, a7 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: sub t0, a4, a2 +; RV32IM-NEXT: mulh a4, a1, a5 +; RV32IM-NEXT: add a4, a4, a1 +; RV32IM-NEXT: srli a2, a4, 31 +; RV32IM-NEXT: srli a4, a4, 6 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: mulh a2, a3, a5 +; RV32IM-NEXT: add a2, a2, a3 +; RV32IM-NEXT: srli a4, a2, 31 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: mulh a3, a6, a5 +; RV32IM-NEXT: add a3, a3, a6 +; RV32IM-NEXT: srli a4, a3, 31 +; RV32IM-NEXT: srli a3, a3, 6 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: mul a3, a3, a7 +; RV32IM-NEXT: sub a3, a6, a3 +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh t0, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_srem_vec_2: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: lh s2, 24(a1) +; RV64I-NEXT: lh s3, 16(a1) +; RV64I-NEXT: lh s0, 8(a1) +; RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s0, a0 
+; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: sh s0, 4(s1) +; RV64I-NEXT: sh s5, 2(s1) +; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_srem_vec_2: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a6, 24(a1) +; RV64IM-NEXT: lh a7, 16(a1) +; RV64IM-NEXT: lh a4, 8(a1) +; RV64IM-NEXT: lh a1, 0(a1) +; RV64IM-NEXT: lui a5, 1045903 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -905 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: add a2, a2, a1 +; RV64IM-NEXT: srli a3, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a3 +; RV64IM-NEXT: addi a3, zero, 95 +; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: sub t0, a1, a2 +; RV64IM-NEXT: mulh a2, a4, a5 +; RV64IM-NEXT: add a2, a2, a4 +; RV64IM-NEXT: srli a1, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a1, a2, a1 +; RV64IM-NEXT: mul a1, a1, a3 +; RV64IM-NEXT: sub a1, a4, a1 +; RV64IM-NEXT: mulh a2, a7, a5 +; RV64IM-NEXT: add a2, a2, a7 +; RV64IM-NEXT: srli a4, a2, 63 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: add a2, a2, a4 +; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: sub a2, a7, a2 +; RV64IM-NEXT: mulh a4, a6, a5 +; RV64IM-NEXT: add a4, a4, a6 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 6 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: mul a3, a4, a3 +; RV64IM-NEXT: sub a3, a6, a3 +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh t0, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. 
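+; (With no vector integer support enabled, the <4 x i16> operation is
+; scalarized: a quotient is computed per lane, via __divsi3/__divdi3 or the
+; multiply-high sequence, and each lane's remainder is derived from it.)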
+define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; RV32I-LABEL: combine_srem_sdiv: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: .cfi_def_cfa_offset 48 +; RV32I-NEXT: sw ra, 44(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s1, 36(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s3, 28(sp) +; RV32I-NEXT: sw s4, 24(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s6, 16(sp) +; RV32I-NEXT: sw s7, 12(sp) +; RV32I-NEXT: sw s8, 8(sp) +; RV32I-NEXT: sw s9, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: .cfi_offset s6, -32 +; RV32I-NEXT: .cfi_offset s7, -36 +; RV32I-NEXT: .cfi_offset s8, -40 +; RV32I-NEXT: .cfi_offset s9, -44 +; RV32I-NEXT: lh s2, 0(a1) +; RV32I-NEXT: lh s3, 4(a1) +; RV32I-NEXT: lh s4, 8(a1) +; RV32I-NEXT: lh s1, 12(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s8, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __divsi3 +; RV32I-NEXT: add a0, s8, a0 +; RV32I-NEXT: add a1, s7, s1 +; RV32I-NEXT: add a2, s6, s4 +; RV32I-NEXT: add a3, s5, s9 +; RV32I-NEXT: sh a3, 6(s0) +; RV32I-NEXT: sh a2, 4(s0) +; RV32I-NEXT: sh a1, 2(s0) +; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: lw s8, 8(sp) +; RV32I-NEXT: lw s7, 12(sp) +; RV32I-NEXT: lw s6, 16(sp) +; RV32I-NEXT: lw s5, 20(sp) +; RV32I-NEXT: lw s4, 24(sp) +; RV32I-NEXT: lw s3, 28(sp) +; RV32I-NEXT: lw s2, 32(sp) +; RV32I-NEXT: lw s1, 36(sp) +; RV32I-NEXT: lw s0, 40(sp) +; RV32I-NEXT: lw ra, 44(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: .cfi_restore s6 +; RV32I-NEXT: .cfi_restore s7 +; RV32I-NEXT: .cfi_restore s8 +; RV32I-NEXT: .cfi_restore s9 +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: combine_srem_sdiv: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a6, 0(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a4, 12(a1) +; RV32IM-NEXT: lh a1, 8(a1) +; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a2, a4, a5 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a7, a2, 31 +; RV32IM-NEXT: srai a2, a2, 6 +; RV32IM-NEXT: add t0, a2, a7 +; RV32IM-NEXT: addi a7, zero, 95 +; RV32IM-NEXT: mul a2, t0, a7 +; RV32IM-NEXT: sub t1, a4, a2 +; RV32IM-NEXT: mulh a4, a1, a5 +; RV32IM-NEXT: add a4, a4, a1 +; RV32IM-NEXT: srli a2, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: mul a4, a2, a7 +; 
RV32IM-NEXT: sub t2, a1, a4 +; RV32IM-NEXT: mulh a4, a3, a5 +; RV32IM-NEXT: add a4, a4, a3 +; RV32IM-NEXT: srli a1, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: mul a4, a1, a7 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: mulh a4, a6, a5 +; RV32IM-NEXT: add a4, a4, a6 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srai a4, a4, 6 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: mul a5, a4, a7 +; RV32IM-NEXT: sub a5, a6, a5 +; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: add a1, a3, a1 +; RV32IM-NEXT: add a2, t2, a2 +; RV32IM-NEXT: add a3, t1, t0 +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: combine_srem_sdiv: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -96 +; RV64I-NEXT: .cfi_def_cfa_offset 96 +; RV64I-NEXT: sd ra, 88(sp) +; RV64I-NEXT: sd s0, 80(sp) +; RV64I-NEXT: sd s1, 72(sp) +; RV64I-NEXT: sd s2, 64(sp) +; RV64I-NEXT: sd s3, 56(sp) +; RV64I-NEXT: sd s4, 48(sp) +; RV64I-NEXT: sd s5, 40(sp) +; RV64I-NEXT: sd s6, 32(sp) +; RV64I-NEXT: sd s7, 24(sp) +; RV64I-NEXT: sd s8, 16(sp) +; RV64I-NEXT: sd s9, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: .cfi_offset s6, -64 +; RV64I-NEXT: .cfi_offset s7, -72 +; RV64I-NEXT: .cfi_offset s8, -80 +; RV64I-NEXT: .cfi_offset s9, -88 +; RV64I-NEXT: lh s2, 0(a1) +; RV64I-NEXT: lh s3, 8(a1) +; RV64I-NEXT: lh s4, 16(a1) +; RV64I-NEXT: lh s1, 24(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s6, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s7, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s8, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: mv s9, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __divdi3 +; RV64I-NEXT: add a0, s8, a0 +; RV64I-NEXT: add a1, s7, s1 +; RV64I-NEXT: add a2, s6, s4 +; RV64I-NEXT: add a3, s5, s9 +; RV64I-NEXT: sh a3, 6(s0) +; RV64I-NEXT: sh a2, 4(s0) +; RV64I-NEXT: sh a1, 2(s0) +; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: ld s9, 8(sp) +; RV64I-NEXT: ld s8, 16(sp) +; RV64I-NEXT: ld s7, 24(sp) +; RV64I-NEXT: ld s6, 32(sp) +; RV64I-NEXT: ld s5, 40(sp) +; RV64I-NEXT: ld s4, 48(sp) +; RV64I-NEXT: ld s3, 56(sp) +; RV64I-NEXT: ld s2, 64(sp) +; RV64I-NEXT: ld s1, 72(sp) +; RV64I-NEXT: ld s0, 80(sp) +; RV64I-NEXT: ld ra, 88(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: .cfi_restore s6 +; RV64I-NEXT: .cfi_restore s7 +; RV64I-NEXT: .cfi_restore s8 +; RV64I-NEXT: .cfi_restore s9 +; RV64I-NEXT: addi sp, sp, 96 +; RV64I-NEXT: 
.cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: combine_srem_sdiv: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a6, 0(a1) +; RV64IM-NEXT: lh a7, 8(a1) +; RV64IM-NEXT: lh a4, 16(a1) +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: lui a5, 1045903 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -905 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a2, a1, a5 +; RV64IM-NEXT: add a2, a2, a1 +; RV64IM-NEXT: srli a3, a2, 63 +; RV64IM-NEXT: srai a2, a2, 6 +; RV64IM-NEXT: add t3, a2, a3 +; RV64IM-NEXT: addi t0, zero, 95 +; RV64IM-NEXT: mul a3, t3, t0 +; RV64IM-NEXT: sub t1, a1, a3 +; RV64IM-NEXT: mulh a3, a4, a5 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: srli a1, a3, 63 +; RV64IM-NEXT: srai a3, a3, 6 +; RV64IM-NEXT: add a1, a3, a1 +; RV64IM-NEXT: mul a3, a1, t0 +; RV64IM-NEXT: sub t2, a4, a3 +; RV64IM-NEXT: mulh a4, a7, a5 +; RV64IM-NEXT: add a4, a4, a7 +; RV64IM-NEXT: srli a3, a4, 63 +; RV64IM-NEXT: srai a4, a4, 6 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: mul a4, a3, t0 +; RV64IM-NEXT: sub a4, a7, a4 +; RV64IM-NEXT: mulh a5, a6, a5 +; RV64IM-NEXT: add a5, a5, a6 +; RV64IM-NEXT: srli a2, a5, 63 +; RV64IM-NEXT: srai a5, a5, 6 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: mul a5, a2, t0 +; RV64IM-NEXT: sub a5, a6, a5 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: add a1, t2, a1 +; RV64IM-NEXT: add a4, t1, t3 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_srem_power_of_two: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh a2, 0(a1) +; RV32I-NEXT: lh a0, 12(a1) +; RV32I-NEXT: lh a3, 8(a1) +; RV32I-NEXT: lh a1, 4(a1) +; RV32I-NEXT: srai a4, a2, 31 +; RV32I-NEXT: srli a4, a4, 26 +; RV32I-NEXT: add a4, a2, a4 +; RV32I-NEXT: lui a6, 16 +; RV32I-NEXT: addi a5, a6, -64 +; RV32I-NEXT: and a4, a4, a5 +; RV32I-NEXT: sub s2, a2, a4 +; RV32I-NEXT: srai a2, a1, 31 +; RV32I-NEXT: srli a2, a2, 27 +; RV32I-NEXT: add a2, a1, a2 +; RV32I-NEXT: addi a4, a6, -32 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: sub s3, a1, a2 +; RV32I-NEXT: srai a1, a3, 31 +; RV32I-NEXT: srli a1, a1, 29 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: addi a2, a6, -8 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub s1, a3, a1 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: sh s2, 0(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_power_of_two: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a6, 8(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a4, 12(a1) +; RV32IM-NEXT: lh a1, 0(a1) +; RV32IM-NEXT: lui a5, 706409 +; RV32IM-NEXT: addi a5, a5, 389 +; RV32IM-NEXT: mulh a5, a4, a5 +; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: srli a2, a5, 31 +; RV32IM-NEXT: srli a5, a5, 6 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a7, a4, a2 +; RV32IM-NEXT: srai a4, a1, 31 +; RV32IM-NEXT: srli a4, a4, 26 +; RV32IM-NEXT: add a4, a1, a4 +; RV32IM-NEXT: lui a5, 16 +; RV32IM-NEXT: addi a2, a5, -64 +; RV32IM-NEXT: and a2, a4, a2 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: srai a2, a3, 31 +; RV32IM-NEXT: srli a2, a2, 27 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: addi a4, a5, -32 +; RV32IM-NEXT: and a2, a2, a4 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: srai a3, a6, 31 +; RV32IM-NEXT: srli a3, a3, 29 +; RV32IM-NEXT: add a3, a6, a3 +; RV32IM-NEXT: addi a4, a5, -8 +; RV32IM-NEXT: and a3, a3, a4 +; RV32IM-NEXT: sub a3, a6, a3 +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: sh a1, 0(a0) +; RV32IM-NEXT: sh a7, 6(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_power_of_two: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: mv s0, a0 +; 
RV64I-NEXT: lh a2, 0(a1) +; RV64I-NEXT: lh a0, 24(a1) +; RV64I-NEXT: lh a3, 16(a1) +; RV64I-NEXT: lh a1, 8(a1) +; RV64I-NEXT: srai a4, a2, 63 +; RV64I-NEXT: srli a4, a4, 58 +; RV64I-NEXT: add a4, a2, a4 +; RV64I-NEXT: lui a6, 16 +; RV64I-NEXT: addiw a5, a6, -64 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: sub s2, a2, a4 +; RV64I-NEXT: srai a2, a1, 63 +; RV64I-NEXT: srli a2, a2, 59 +; RV64I-NEXT: add a2, a1, a2 +; RV64I-NEXT: addiw a4, a6, -32 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: sub s3, a1, a2 +; RV64I-NEXT: srai a1, a3, 63 +; RV64I-NEXT: srli a1, a1, 61 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: addiw a2, a6, -8 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub s1, a3, a1 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: sh s2, 0(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_power_of_two: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a6, 16(a1) +; RV64IM-NEXT: lh a3, 8(a1) +; RV64IM-NEXT: lh a4, 0(a1) +; RV64IM-NEXT: lh a1, 24(a1) +; RV64IM-NEXT: lui a5, 1045903 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -905 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1767 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: srli a2, a5, 63 +; RV64IM-NEXT: srli a5, a5, 6 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a7, a1, a2 +; RV64IM-NEXT: srai a2, a4, 63 +; RV64IM-NEXT: srli a2, a2, 58 +; RV64IM-NEXT: add a2, a4, a2 +; RV64IM-NEXT: lui a5, 16 +; RV64IM-NEXT: addiw a1, a5, -64 +; RV64IM-NEXT: and a1, a2, a1 +; RV64IM-NEXT: sub a1, a4, a1 +; RV64IM-NEXT: srai a2, a3, 63 +; RV64IM-NEXT: srli a2, a2, 59 +; RV64IM-NEXT: add a2, a3, a2 +; RV64IM-NEXT: addiw a4, a5, -32 +; RV64IM-NEXT: and a2, a2, a4 +; RV64IM-NEXT: sub a2, a3, a2 +; RV64IM-NEXT: srai a3, a6, 63 +; RV64IM-NEXT: srli a3, a3, 61 +; RV64IM-NEXT: add a3, a6, a3 +; RV64IM-NEXT: addiw a4, a5, -8 +; RV64IM-NEXT: and a3, a3, a4 +; RV64IM-NEXT: sub a3, a6, a3 +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: sh a7, 6(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
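+; (The lane whose divisor is 1 is known to be zero, so a zero is stored
+; directly and only the remaining three lanes are computed.)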
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_srem_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: lh s2, 12(a1) +; RV32I-NEXT: lh s1, 8(a1) +; RV32I-NEXT: lh a2, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 654 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a1, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh zero, 0(s0) +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_one: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a2, 12(a1) +; RV32IM-NEXT: lh a3, 4(a1) +; RV32IM-NEXT: lh a1, 8(a1) +; RV32IM-NEXT: lui a4, 820904 +; RV32IM-NEXT: addi a4, a4, -1903 +; RV32IM-NEXT: mulh a4, a3, a4 +; RV32IM-NEXT: add a4, a4, a3 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 9 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, 654 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: lui a4, 729444 +; RV32IM-NEXT: addi a4, a4, 713 +; RV32IM-NEXT: mulh a4, a1, a4 +; RV32IM-NEXT: add a4, a4, a1 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 4 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: addi a5, zero, 23 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: lui a4, 395996 +; RV32IM-NEXT: addi a4, a4, -2009 +; RV32IM-NEXT: mulh a4, a2, a4 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 11 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lui a5, 1 +; RV32IM-NEXT: addi a5, a5, 1327 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: sh zero, 0(a0) +; RV32IM-NEXT: sh a2, 6(a0) +; RV32IM-NEXT: sh a1, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: lh s2, 24(a1) +; RV64I-NEXT: lh s1, 16(a1) +; RV64I-NEXT: lh a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, 
a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh zero, 0(s0) +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_one: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a2, 24(a1) +; RV64IM-NEXT: lh a3, 8(a1) +; RV64IM-NEXT: lh a1, 16(a1) +; RV64IM-NEXT: lui a4, 1043590 +; RV64IM-NEXT: addiw a4, a4, -1781 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1069 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1959 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 357 +; RV64IM-NEXT: mulh a4, a1, a4 +; RV64IM-NEXT: add a4, a4, a1 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: lui a4, 6413 +; RV64IM-NEXT: addiw a4, a4, 1265 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1027 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1077 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 965 +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 8 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 654 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 12375 +; RV64IM-NEXT: addiw a4, a4, -575 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 883 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, -431 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 1959 +; RV64IM-NEXT: mulh a4, a2, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srli a4, a4, 11 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sh zero, 0(a0) +; RV64IM-NEXT: sh a2, 6(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
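+; (For i16 the divisor 2^15 is the sign-bit value, so that lane is handled
+; like a power of two: sign adjust, mask with 0x8000 and subtract.)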
+define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_urem_i16_smax: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: lh a2, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lh s2, 12(a1) +; RV32I-NEXT: lh a0, 8(a1) +; RV32I-NEXT: slli a1, a2, 16 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: srli a1, a1, 17 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: lui a3, 8 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: sub s3, a2, a1 +; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a1, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __modsi3 +; RV32I-NEXT: sh zero, 0(s0) +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_i16_smax: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lh a2, 4(a1) +; RV32IM-NEXT: slli a6, a2, 16 +; RV32IM-NEXT: lh a4, 8(a1) +; RV32IM-NEXT: lh a1, 12(a1) +; RV32IM-NEXT: lui a5, 729444 +; RV32IM-NEXT: addi a5, a5, 713 +; RV32IM-NEXT: mulh a5, a4, a5 +; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: srli a3, a5, 31 +; RV32IM-NEXT: srli a5, a5, 4 +; RV32IM-NEXT: add a3, a5, a3 +; RV32IM-NEXT: addi a5, zero, 23 +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: sub a3, a4, a3 +; RV32IM-NEXT: lui a4, 395996 +; RV32IM-NEXT: addi a4, a4, -2009 +; RV32IM-NEXT: mulh a4, a1, a4 +; RV32IM-NEXT: srli a5, a4, 31 +; RV32IM-NEXT: srli a4, a4, 11 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: lui a5, 1 +; RV32IM-NEXT: addi a5, a5, 1327 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: srai a4, a6, 31 +; RV32IM-NEXT: srli a4, a4, 17 +; RV32IM-NEXT: add a4, a2, a4 +; RV32IM-NEXT: lui a5, 8 +; RV32IM-NEXT: and a4, a4, a5 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: sh zero, 0(a0) +; RV32IM-NEXT: sh a1, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_i16_smax: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: lh a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: lh s2, 24(a1) +; RV64I-NEXT: lh a0, 16(a1) +; RV64I-NEXT: slli a1, a2, 48 +; RV64I-NEXT: srai a1, a1, 63 +; RV64I-NEXT: srli a1, a1, 49 +; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: lui a3, 8 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: sub s3, a2, a1 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s1, 
a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sh zero, 0(s0) +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_urem_i16_smax: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lh a2, 8(a1) +; RV64IM-NEXT: slli a6, a2, 48 +; RV64IM-NEXT: lh a4, 24(a1) +; RV64IM-NEXT: lh a1, 16(a1) +; RV64IM-NEXT: lui a5, 1043590 +; RV64IM-NEXT: addiw a5, a5, -1781 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1069 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, -1959 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 357 +; RV64IM-NEXT: mulh a5, a1, a5 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: srli a3, a5, 63 +; RV64IM-NEXT: srli a5, a5, 4 +; RV64IM-NEXT: add a3, a5, a3 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: sub a1, a1, a3 +; RV64IM-NEXT: lui a3, 12375 +; RV64IM-NEXT: addiw a3, a3, -575 +; RV64IM-NEXT: slli a3, a3, 12 +; RV64IM-NEXT: addi a3, a3, 883 +; RV64IM-NEXT: slli a3, a3, 13 +; RV64IM-NEXT: addi a3, a3, -431 +; RV64IM-NEXT: slli a3, a3, 12 +; RV64IM-NEXT: addi a3, a3, 1959 +; RV64IM-NEXT: mulh a3, a4, a3 +; RV64IM-NEXT: srli a5, a3, 63 +; RV64IM-NEXT: srli a3, a3, 11 +; RV64IM-NEXT: add a3, a3, a5 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: sub a3, a4, a3 +; RV64IM-NEXT: srai a4, a6, 63 +; RV64IM-NEXT: srli a4, a4, 49 +; RV64IM-NEXT: add a4, a2, a4 +; RV64IM-NEXT: lui a5, 8 +; RV64IM-NEXT: and a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sh zero, 0(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. 
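+; (On RV32 there is no 64x64->128 multiply-high, so each i64 lane is lowered
+; to a __moddi3 libcall even with the M extension.)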
+define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; RV32I-LABEL: dont_fold_srem_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: .cfi_def_cfa_offset 48 +; RV32I-NEXT: sw ra, 44(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s1, 36(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s3, 28(sp) +; RV32I-NEXT: sw s4, 24(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s6, 16(sp) +; RV32I-NEXT: sw s7, 12(sp) +; RV32I-NEXT: sw s8, 8(sp) +; RV32I-NEXT: sw s9, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: .cfi_offset s6, -32 +; RV32I-NEXT: .cfi_offset s7, -36 +; RV32I-NEXT: .cfi_offset s8, -40 +; RV32I-NEXT: .cfi_offset s9, -44 +; RV32I-NEXT: lw s2, 24(a1) +; RV32I-NEXT: lw s3, 28(a1) +; RV32I-NEXT: lw s4, 16(a1) +; RV32I-NEXT: lw s5, 20(a1) +; RV32I-NEXT: lw s6, 8(a1) +; RV32I-NEXT: lw s1, 12(a1) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: mv s8, a1 +; RV32I-NEXT: addi a2, zero, 654 +; RV32I-NEXT: mv a0, s6 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: mv s9, a1 +; RV32I-NEXT: addi a2, zero, 23 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a2, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __moddi3 +; RV32I-NEXT: sw a1, 28(s0) +; RV32I-NEXT: sw a0, 24(s0) +; RV32I-NEXT: sw s1, 20(s0) +; RV32I-NEXT: sw s4, 16(s0) +; RV32I-NEXT: sw s9, 12(s0) +; RV32I-NEXT: sw s6, 8(s0) +; RV32I-NEXT: sw s8, 4(s0) +; RV32I-NEXT: sw s7, 0(s0) +; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: lw s8, 8(sp) +; RV32I-NEXT: lw s7, 12(sp) +; RV32I-NEXT: lw s6, 16(sp) +; RV32I-NEXT: lw s5, 20(sp) +; RV32I-NEXT: lw s4, 24(sp) +; RV32I-NEXT: lw s3, 28(sp) +; RV32I-NEXT: lw s2, 32(sp) +; RV32I-NEXT: lw s1, 36(sp) +; RV32I-NEXT: lw s0, 40(sp) +; RV32I-NEXT: lw ra, 44(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: .cfi_restore s6 +; RV32I-NEXT: .cfi_restore s7 +; RV32I-NEXT: .cfi_restore s8 +; RV32I-NEXT: .cfi_restore s9 +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_srem_i64: +; RV32IM: # %bb.0: +; RV32IM-NEXT: addi sp, sp, -48 +; RV32IM-NEXT: .cfi_def_cfa_offset 48 +; RV32IM-NEXT: sw ra, 44(sp) +; RV32IM-NEXT: sw s0, 40(sp) +; RV32IM-NEXT: sw s1, 36(sp) +; RV32IM-NEXT: sw s2, 32(sp) +; RV32IM-NEXT: sw s3, 28(sp) +; RV32IM-NEXT: sw s4, 24(sp) +; RV32IM-NEXT: sw s5, 20(sp) +; RV32IM-NEXT: sw s6, 16(sp) +; RV32IM-NEXT: sw s7, 12(sp) +; RV32IM-NEXT: sw s8, 8(sp) +; RV32IM-NEXT: sw s9, 4(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: .cfi_offset s0, -8 +; RV32IM-NEXT: .cfi_offset s1, -12 +; RV32IM-NEXT: .cfi_offset s2, -16 +; RV32IM-NEXT: .cfi_offset s3, -20 +; RV32IM-NEXT: .cfi_offset s4, -24 +; RV32IM-NEXT: .cfi_offset s5, -28 +; RV32IM-NEXT: .cfi_offset s6, 
-32 +; RV32IM-NEXT: .cfi_offset s7, -36 +; RV32IM-NEXT: .cfi_offset s8, -40 +; RV32IM-NEXT: .cfi_offset s9, -44 +; RV32IM-NEXT: lw s2, 24(a1) +; RV32IM-NEXT: lw s3, 28(a1) +; RV32IM-NEXT: lw s4, 16(a1) +; RV32IM-NEXT: lw s5, 20(a1) +; RV32IM-NEXT: lw s6, 8(a1) +; RV32IM-NEXT: lw s1, 12(a1) +; RV32IM-NEXT: lw a3, 0(a1) +; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: addi a2, zero, 1 +; RV32IM-NEXT: mv a0, a3 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: mv s7, a0 +; RV32IM-NEXT: mv s8, a1 +; RV32IM-NEXT: addi a2, zero, 654 +; RV32IM-NEXT: mv a0, s6 +; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: mv s6, a0 +; RV32IM-NEXT: mv s9, a1 +; RV32IM-NEXT: addi a2, zero, 23 +; RV32IM-NEXT: mv a0, s4 +; RV32IM-NEXT: mv a1, s5 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: mv s4, a0 +; RV32IM-NEXT: mv s1, a1 +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a2, a0, 1327 +; RV32IM-NEXT: mv a0, s2 +; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __moddi3 +; RV32IM-NEXT: sw a1, 28(s0) +; RV32IM-NEXT: sw a0, 24(s0) +; RV32IM-NEXT: sw s1, 20(s0) +; RV32IM-NEXT: sw s4, 16(s0) +; RV32IM-NEXT: sw s9, 12(s0) +; RV32IM-NEXT: sw s6, 8(s0) +; RV32IM-NEXT: sw s8, 4(s0) +; RV32IM-NEXT: sw s7, 0(s0) +; RV32IM-NEXT: lw s9, 4(sp) +; RV32IM-NEXT: lw s8, 8(sp) +; RV32IM-NEXT: lw s7, 12(sp) +; RV32IM-NEXT: lw s6, 16(sp) +; RV32IM-NEXT: lw s5, 20(sp) +; RV32IM-NEXT: lw s4, 24(sp) +; RV32IM-NEXT: lw s3, 28(sp) +; RV32IM-NEXT: lw s2, 32(sp) +; RV32IM-NEXT: lw s1, 36(sp) +; RV32IM-NEXT: lw s0, 40(sp) +; RV32IM-NEXT: lw ra, 44(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: .cfi_restore s0 +; RV32IM-NEXT: .cfi_restore s1 +; RV32IM-NEXT: .cfi_restore s2 +; RV32IM-NEXT: .cfi_restore s3 +; RV32IM-NEXT: .cfi_restore s4 +; RV32IM-NEXT: .cfi_restore s5 +; RV32IM-NEXT: .cfi_restore s6 +; RV32IM-NEXT: .cfi_restore s7 +; RV32IM-NEXT: .cfi_restore s8 +; RV32IM-NEXT: .cfi_restore s9 +; RV32IM-NEXT: addi sp, sp, 48 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_srem_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: ld s2, 24(a1) +; RV64I-NEXT: ld s1, 16(a1) +; RV64I-NEXT: ld a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __moddi3 +; RV64I-NEXT: sd zero, 0(s0) +; RV64I-NEXT: sd a0, 24(s0) +; RV64I-NEXT: sd s1, 16(s0) +; RV64I-NEXT: sd s3, 8(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_srem_i64: +; 
RV64IM: # %bb.0: +; RV64IM-NEXT: ld a2, 24(a1) +; RV64IM-NEXT: ld a3, 8(a1) +; RV64IM-NEXT: ld a1, 16(a1) +; RV64IM-NEXT: lui a4, 1043590 +; RV64IM-NEXT: addiw a4, a4, -1781 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1069 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1959 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 357 +; RV64IM-NEXT: mulh a4, a1, a4 +; RV64IM-NEXT: add a4, a4, a1 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srai a4, a4, 4 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: lui a4, 6413 +; RV64IM-NEXT: addiw a4, a4, 1265 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1027 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1077 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 965 +; RV64IM-NEXT: mulh a4, a3, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srai a4, a4, 8 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: addi a5, zero, 654 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 12375 +; RV64IM-NEXT: addiw a4, a4, -575 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 883 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, -431 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 1959 +; RV64IM-NEXT: mulh a4, a2, a4 +; RV64IM-NEXT: srli a5, a4, 63 +; RV64IM-NEXT: srai a4, a4, 11 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sd zero, 0(a0) +; RV64IM-NEXT: sd a2, 24(a0) +; RV64IM-NEXT: sd a3, 8(a0) +; RV64IM-NEXT: sd a1, 16(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} diff --git a/llvm/test/CodeGen/RISCV/stack-realignment.ll b/llvm/test/CodeGen/RISCV/stack-realignment.ll index dd06d6f0bff0fe..c15e6e14b9ad30 100644 --- a/llvm/test/CodeGen/RISCV/stack-realignment.ll +++ b/llvm/test/CodeGen/RISCV/stack-realignment.ll @@ -309,20 +309,11 @@ define void @caller_no_realign512() nounwind "no-realign-stack" { define void @caller1024() nounwind { ; RV32I-LABEL: caller1024: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, -1024 -; RV32I-NEXT: sub sp, sp, a0 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, -1028 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: sw ra, 0(a0) -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, -1032 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: sw s0, 0(a0) -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, -1024 -; RV32I-NEXT: add s0, sp, a0 +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: sw ra, 2028(sp) +; RV32I-NEXT: sw s0, 2024(sp) +; RV32I-NEXT: addi s0, sp, 2032 +; RV32I-NEXT: addi sp, sp, -1040 ; RV32I-NEXT: andi sp, sp, -1024 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a0, a0, -2048 @@ -332,35 +323,19 @@ define void @caller1024() nounwind { ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a0, a0, -1024 ; RV32I-NEXT: sub sp, s0, a0 -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, -1032 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lw s0, 0(a0) -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, -1028 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lw ra, 0(a0) -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, -1024 -; RV32I-NEXT: add sp, sp, a0 +; RV32I-NEXT: addi sp, sp, 1040 +; RV32I-NEXT: lw s0, 2024(sp) +; RV32I-NEXT: lw ra, 2028(sp) +; RV32I-NEXT: addi sp, sp, 2032 ; RV32I-NEXT: ret ; ; 
RV64I-LABEL: caller1024: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, -1024 -; RV64I-NEXT: sub sp, sp, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, -1032 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: sd ra, 0(a0) -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, -1040 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: sd s0, 0(a0) -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, -1024 -; RV64I-NEXT: add s0, sp, a0 +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: sd ra, 2024(sp) +; RV64I-NEXT: sd s0, 2016(sp) +; RV64I-NEXT: addi s0, sp, 2032 +; RV64I-NEXT: addi sp, sp, -1040 ; RV64I-NEXT: andi sp, sp, -1024 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a0, a0, -2048 @@ -370,17 +345,10 @@ define void @caller1024() nounwind { ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a0, a0, -1024 ; RV64I-NEXT: sub sp, s0, a0 -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, -1040 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: ld s0, 0(a0) -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, -1032 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: ld ra, 0(a0) -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, -1024 -; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: addi sp, sp, 1040 +; RV64I-NEXT: ld s0, 2016(sp) +; RV64I-NEXT: ld ra, 2024(sp) +; RV64I-NEXT: addi sp, sp, 2032 ; RV64I-NEXT: ret %1 = alloca i8, align 1024 call void @callee(i8* %1) @@ -415,20 +383,13 @@ define void @caller_no_realign1024() nounwind "no-realign-stack" { define void @caller2048() nounwind { ; RV32I-LABEL: caller2048: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a0, 2 -; RV32I-NEXT: addi a0, a0, -2048 -; RV32I-NEXT: sub sp, sp, a0 +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: sw ra, 2028(sp) +; RV32I-NEXT: sw s0, 2024(sp) +; RV32I-NEXT: addi s0, sp, 2032 ; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, 2044 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: sw ra, 0(a0) -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, 2040 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: sw s0, 0(a0) -; RV32I-NEXT: lui a0, 2 -; RV32I-NEXT: addi a0, a0, -2048 -; RV32I-NEXT: add s0, sp, a0 +; RV32I-NEXT: addi a0, a0, 16 +; RV32I-NEXT: sub sp, sp, a0 ; RV32I-NEXT: andi sp, sp, -2048 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: add a0, sp, a0 @@ -438,34 +399,22 @@ define void @caller2048() nounwind { ; RV32I-NEXT: addi a0, a0, -2048 ; RV32I-NEXT: sub sp, s0, a0 ; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, 2040 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lw s0, 0(a0) -; RV32I-NEXT: lui a0, 1 -; RV32I-NEXT: addi a0, a0, 2044 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lw ra, 0(a0) -; RV32I-NEXT: lui a0, 2 -; RV32I-NEXT: addi a0, a0, -2048 +; RV32I-NEXT: addi a0, a0, 16 ; RV32I-NEXT: add sp, sp, a0 +; RV32I-NEXT: lw s0, 2024(sp) +; RV32I-NEXT: lw ra, 2028(sp) +; RV32I-NEXT: addi sp, sp, 2032 ; RV32I-NEXT: ret ; ; RV64I-LABEL: caller2048: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a0, 2 -; RV64I-NEXT: addiw a0, a0, -2048 -; RV64I-NEXT: sub sp, sp, a0 +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: sd ra, 2024(sp) +; RV64I-NEXT: sd s0, 2016(sp) +; RV64I-NEXT: addi s0, sp, 2032 ; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, 2040 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: sd ra, 0(a0) -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, 2032 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: sd s0, 0(a0) -; RV64I-NEXT: lui a0, 2 -; RV64I-NEXT: addiw a0, a0, -2048 -; RV64I-NEXT: add s0, sp, a0 +; RV64I-NEXT: addiw a0, a0, 16 +; RV64I-NEXT: sub sp, sp, a0 ; RV64I-NEXT: andi sp, sp, -2048 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: add a0, 
sp, a0 @@ -475,16 +424,11 @@ define void @caller2048() nounwind { ; RV64I-NEXT: addiw a0, a0, -2048 ; RV64I-NEXT: sub sp, s0, a0 ; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, 2032 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: ld s0, 0(a0) -; RV64I-NEXT: lui a0, 1 -; RV64I-NEXT: addiw a0, a0, 2040 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: ld ra, 0(a0) -; RV64I-NEXT: lui a0, 2 -; RV64I-NEXT: addiw a0, a0, -2048 +; RV64I-NEXT: addiw a0, a0, 16 ; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: ld s0, 2016(sp) +; RV64I-NEXT: ld ra, 2024(sp) +; RV64I-NEXT: addi sp, sp, 2032 ; RV64I-NEXT: ret %1 = alloca i8, align 2048 call void @callee(i8* %1) @@ -519,18 +463,13 @@ define void @caller_no_realign2048() nounwind "no-realign-stack" { define void @caller4096() nounwind { ; RV32I-LABEL: caller4096: ; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -2032 +; RV32I-NEXT: sw ra, 2028(sp) +; RV32I-NEXT: sw s0, 2024(sp) +; RV32I-NEXT: addi s0, sp, 2032 ; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, -2032 ; RV32I-NEXT: sub sp, sp, a0 -; RV32I-NEXT: lui a0, 3 -; RV32I-NEXT: addi a0, a0, -4 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: sw ra, 0(a0) -; RV32I-NEXT: lui a0, 3 -; RV32I-NEXT: addi a0, a0, -8 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: sw s0, 0(a0) -; RV32I-NEXT: lui a0, 3 -; RV32I-NEXT: add s0, sp, a0 ; RV32I-NEXT: srli a0, sp, 12 ; RV32I-NEXT: slli sp, a0, 12 ; RV32I-NEXT: lui a0, 2 @@ -540,31 +479,22 @@ define void @caller4096() nounwind { ; RV32I-NEXT: lui a0, 3 ; RV32I-NEXT: sub sp, s0, a0 ; RV32I-NEXT: lui a0, 3 -; RV32I-NEXT: addi a0, a0, -8 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lw s0, 0(a0) -; RV32I-NEXT: lui a0, 3 -; RV32I-NEXT: addi a0, a0, -4 -; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: lw ra, 0(a0) -; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, -2032 ; RV32I-NEXT: add sp, sp, a0 +; RV32I-NEXT: lw s0, 2024(sp) +; RV32I-NEXT: lw ra, 2028(sp) +; RV32I-NEXT: addi sp, sp, 2032 ; RV32I-NEXT: ret ; ; RV64I-LABEL: caller4096: ; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: sd ra, 2024(sp) +; RV64I-NEXT: sd s0, 2016(sp) +; RV64I-NEXT: addi s0, sp, 2032 ; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, -2032 ; RV64I-NEXT: sub sp, sp, a0 -; RV64I-NEXT: lui a0, 3 -; RV64I-NEXT: addiw a0, a0, -8 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: sd ra, 0(a0) -; RV64I-NEXT: lui a0, 3 -; RV64I-NEXT: addiw a0, a0, -16 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: sd s0, 0(a0) -; RV64I-NEXT: lui a0, 3 -; RV64I-NEXT: add s0, sp, a0 ; RV64I-NEXT: srli a0, sp, 12 ; RV64I-NEXT: slli sp, a0, 12 ; RV64I-NEXT: lui a0, 2 @@ -574,15 +504,11 @@ define void @caller4096() nounwind { ; RV64I-NEXT: lui a0, 3 ; RV64I-NEXT: sub sp, s0, a0 ; RV64I-NEXT: lui a0, 3 -; RV64I-NEXT: addiw a0, a0, -16 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: ld s0, 0(a0) -; RV64I-NEXT: lui a0, 3 -; RV64I-NEXT: addiw a0, a0, -8 -; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: ld ra, 0(a0) -; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, -2032 ; RV64I-NEXT: add sp, sp, a0 +; RV64I-NEXT: ld s0, 2016(sp) +; RV64I-NEXT: ld ra, 2024(sp) +; RV64I-NEXT: addi sp, sp, 2032 ; RV64I-NEXT: ret %1 = alloca i8, align 4096 call void @callee(i8* %1) diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll new file mode 100644 index 00000000000000..374ce07b2ac2d2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -0,0 +1,354 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck 
-check-prefixes=CHECK,RV32I %s +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s + +define i32 @fold_urem_positive_odd(i32 %x) { +; RV32I-LABEL: fold_urem_positive_odd: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_urem_positive_odd: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 364242 +; RV32IM-NEXT: addi a1, a1, 777 +; RV32IM-NEXT: mulhu a1, a0, a1 +; RV32IM-NEXT: sub a2, a0, a1 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: srli a1, a1, 6 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_urem_positive_odd: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_urem_positive_odd: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 32 +; RV64IM-NEXT: srli a0, a0, 32 +; RV64IM-NEXT: lui a1, 1423 +; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1035 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -1811 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, 561 +; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: sub a2, a0, a1 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a1, a2, a1 +; RV64IM-NEXT: srli a1, a1, 6 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positive_even(i32 %x) { +; RV32I-LABEL: fold_urem_positive_even: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a1, zero, 1060 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_urem_positive_even: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 1012964 +; RV32IM-NEXT: addi a1, a1, -61 +; RV32IM-NEXT: mulhu a1, a0, a1 +; RV32IM-NEXT: srli a1, a1, 10 +; RV32IM-NEXT: addi a2, zero, 1060 +; RV32IM-NEXT: mul a1, a1, a2 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_urem_positive_even: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: 
addi a1, zero, 1060 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_urem_positive_even: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 32 +; RV64IM-NEXT: srli a0, a0, 32 +; RV64IM-NEXT: lui a1, 1048020 +; RV64IM-NEXT: addiw a1, a1, -1793 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, 139 +; RV64IM-NEXT: slli a1, a1, 14 +; RV64IM-NEXT: addi a1, a1, 1793 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, -139 +; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: srli a1, a1, 10 +; RV64IM-NEXT: addi a2, zero, 1060 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. +define i32 @combine_urem_udiv(i32 %x) { +; RV32I-LABEL: combine_urem_udiv: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: sw s0, 8(sp) +; RV32I-NEXT: sw s1, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: add a0, s1, a0 +; RV32I-NEXT: lw s1, 4(sp) +; RV32I-NEXT: lw s0, 8(sp) +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: combine_urem_udiv: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a1, 364242 +; RV32IM-NEXT: addi a1, a1, 777 +; RV32IM-NEXT: mulhu a1, a0, a1 +; RV32IM-NEXT: sub a2, a0, a1 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: srli a1, a1, 6 +; RV32IM-NEXT: addi a2, zero, 95 +; RV32IM-NEXT: mul a2, a1, a2 +; RV32IM-NEXT: sub a0, a0, a2 +; RV32IM-NEXT: add a0, a0, a1 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: combine_urem_udiv: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: .cfi_def_cfa_offset 32 +; RV64I-NEXT: sd ra, 24(sp) +; RV64I-NEXT: sd s0, 16(sp) +; RV64I-NEXT: sd s1, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli s0, a0, 32 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: add a0, s1, a0 +; RV64I-NEXT: ld s1, 8(sp) +; RV64I-NEXT: ld s0, 16(sp) +; RV64I-NEXT: ld ra, 24(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: combine_urem_udiv: +; RV64IM: # %bb.0: +; RV64IM-NEXT: slli a0, a0, 32 +; RV64IM-NEXT: srli a0, a0, 32 +; RV64IM-NEXT: lui a1, 1423 +; RV64IM-NEXT: addiw a1, a1, -733 +; RV64IM-NEXT: slli a1, a1, 15 +; RV64IM-NEXT: addi a1, a1, 1035 +; RV64IM-NEXT: slli a1, a1, 13 +; RV64IM-NEXT: addi a1, a1, -1811 +; RV64IM-NEXT: slli a1, a1, 12 +; RV64IM-NEXT: addi a1, a1, 561 +; RV64IM-NEXT: mulhu a1, a0, a1 +; RV64IM-NEXT: sub a2, a0, a1 +; 
RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a1, a2, a1 +; RV64IM-NEXT: srli a1, a1, 6 +; RV64IM-NEXT: addi a2, zero, 95 +; RV64IM-NEXT: mul a2, a1, a2 +; RV64IM-NEXT: sub a0, a0, a2 +; RV64IM-NEXT: add a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 63 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: mv a0, zero +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: # %bb.0: +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; RV32I-LABEL: dont_fold_urem_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: .cfi_def_cfa_offset 16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: addi a2, zero, 98 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_i64: +; RV32IM: # %bb.0: +; RV32IM-NEXT: addi sp, sp, -16 +; RV32IM-NEXT: .cfi_def_cfa_offset 16 +; RV32IM-NEXT: sw ra, 12(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: addi a2, zero, 98 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: lw ra, 12(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: .cfi_def_cfa_offset 16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_urem_i64: +; RV64IM: # %bb.0: +; RV64IM-NEXT: srli a1, a0, 1 +; RV64IM-NEXT: lui a2, 2675 +; RV64IM-NEXT: addiw a2, a2, -251 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi a2, a2, 1839 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi a2, a2, 167 +; RV64IM-NEXT: slli a2, a2, 13 +; RV64IM-NEXT: addi a2, a2, 1505 +; RV64IM-NEXT: mulhu a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 4 +; RV64IM-NEXT: addi a2, zero, 98 +; RV64IM-NEXT: mul a1, a1, a2 +; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem i64 %x, 98 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll new file mode 100644 index 00000000000000..bab79aeb0ee083 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -0,0 +1,1419 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s +; RUN: llc 
-mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s + + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; RV32I-LABEL: fold_urem_vec_1: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s3, 8(a1) +; RV32I-NEXT: lhu s0, 4(a1) +; RV32I-NEXT: lhu a2, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 124 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 98 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 1003 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: sh s0, 4(s1) +; RV32I-NEXT: sh s5, 2(s1) +; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_urem_vec_1: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a6, 12(a1) +; RV32IM-NEXT: lhu a3, 8(a1) +; RV32IM-NEXT: lhu a4, 0(a1) +; RV32IM-NEXT: lhu a1, 4(a1) +; RV32IM-NEXT: lui a5, 364242 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a5, a4, a5 +; RV32IM-NEXT: sub a2, a4, a5 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: srli a4, a1, 2 +; RV32IM-NEXT: lui a5, 135300 +; RV32IM-NEXT: addi a5, a5, 529 +; RV32IM-NEXT: mulhu a4, a4, a5 +; RV32IM-NEXT: srli a4, a4, 2 +; RV32IM-NEXT: addi a5, zero, 124 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: lui a4, 342392 +; RV32IM-NEXT: addi a4, a4, 669 +; RV32IM-NEXT: mulhu a4, a3, a4 +; RV32IM-NEXT: srli a4, a4, 5 +; RV32IM-NEXT: addi a5, zero, 98 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: lui a4, 267633 +; RV32IM-NEXT: addi a4, a4, -1809 +; RV32IM-NEXT: mulhu a4, a6, a4 +; RV32IM-NEXT: srli a4, a4, 8 +; RV32IM-NEXT: addi a5, zero, 1003 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a4, a6, a4 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: 
fold_urem_vec_1: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s3, 16(a1) +; RV64I-NEXT: lhu s0, 8(a1) +; RV64I-NEXT: lhu a2, 0(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 124 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 98 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 1003 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: sh s0, 4(s1) +; RV64I-NEXT: sh s5, 2(s1) +; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_urem_vec_1: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a6, 24(a1) +; RV64IM-NEXT: lhu a3, 16(a1) +; RV64IM-NEXT: lhu a4, 8(a1) +; RV64IM-NEXT: lhu a1, 0(a1) +; RV64IM-NEXT: lui a5, 1423 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, -1811 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a5, a1, a5 +; RV64IM-NEXT: sub a2, a1, a5 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a2 +; RV64IM-NEXT: srli a2, a4, 2 +; RV64IM-NEXT: lui a5, 264 +; RV64IM-NEXT: addiw a5, a5, 1057 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1057 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1057 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 133 +; RV64IM-NEXT: mulhu a2, a2, a5 +; RV64IM-NEXT: srli a2, a2, 3 +; RV64IM-NEXT: addi a5, zero, 124 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a2, a4, a2 +; RV64IM-NEXT: srli a4, a3, 1 +; RV64IM-NEXT: lui a5, 2675 +; RV64IM-NEXT: addiw a5, a5, -251 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1839 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 167 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1505 +; RV64IM-NEXT: mulhu a4, a4, a5 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: addi a5, zero, 98 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 8364 +; RV64IM-NEXT: addiw a4, a4, -1977 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 1907 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 453 +; RV64IM-NEXT: 
slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 1213 +; RV64IM-NEXT: mulhu a4, a6, a4 +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: addi a5, zero, 1003 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a4, a6, a4 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a3, 4(a0) +; RV64IM-NEXT: sh a2, 2(a0) +; RV64IM-NEXT: sh a1, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; RV32I-LABEL: fold_urem_vec_2: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s3, 8(a1) +; RV32I-NEXT: lhu s0, 4(a1) +; RV32I-NEXT: lhu a2, 0(a1) +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: sh a0, 6(s1) +; RV32I-NEXT: sh s0, 4(s1) +; RV32I-NEXT: sh s5, 2(s1) +; RV32I-NEXT: sh s4, 0(s1) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: fold_urem_vec_2: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a6, 12(a1) +; RV32IM-NEXT: lhu a7, 8(a1) +; RV32IM-NEXT: lhu a4, 0(a1) +; RV32IM-NEXT: lhu a1, 4(a1) +; RV32IM-NEXT: lui a5, 364242 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a2, a4, a5 +; RV32IM-NEXT: sub a3, a4, a2 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: addi a3, zero, 95 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: sub t0, a4, a2 +; RV32IM-NEXT: mulhu a4, a1, a5 +; RV32IM-NEXT: sub a2, a1, a4 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: mulhu a2, a7, a5 +; RV32IM-NEXT: sub a4, a7, a2 +; RV32IM-NEXT: srli a4, a4, 1 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: sub a2, a7, a2 +; RV32IM-NEXT: mulhu a4, a6, a5 +; RV32IM-NEXT: sub a5, a6, a4 +; RV32IM-NEXT: srli a5, a5, 1 +; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: srli a4, a4, 6 +; RV32IM-NEXT: mul a3, a4, a3 +; RV32IM-NEXT: sub a3, a6, a3 +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh t0, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; 
RV32IM-NEXT: ret +; +; RV64I-LABEL: fold_urem_vec_2: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: .cfi_def_cfa_offset 64 +; RV64I-NEXT: sd ra, 56(sp) +; RV64I-NEXT: sd s0, 48(sp) +; RV64I-NEXT: sd s1, 40(sp) +; RV64I-NEXT: sd s2, 32(sp) +; RV64I-NEXT: sd s3, 24(sp) +; RV64I-NEXT: sd s4, 16(sp) +; RV64I-NEXT: sd s5, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s3, 16(a1) +; RV64I-NEXT: lhu s0, 8(a1) +; RV64I-NEXT: lhu a2, 0(a1) +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sh a0, 6(s1) +; RV64I-NEXT: sh s0, 4(s1) +; RV64I-NEXT: sh s5, 2(s1) +; RV64I-NEXT: sh s4, 0(s1) +; RV64I-NEXT: ld s5, 8(sp) +; RV64I-NEXT: ld s4, 16(sp) +; RV64I-NEXT: ld s3, 24(sp) +; RV64I-NEXT: ld s2, 32(sp) +; RV64I-NEXT: ld s1, 40(sp) +; RV64I-NEXT: ld s0, 48(sp) +; RV64I-NEXT: ld ra, 56(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: fold_urem_vec_2: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a6, 24(a1) +; RV64IM-NEXT: lhu a7, 16(a1) +; RV64IM-NEXT: lhu a4, 8(a1) +; RV64IM-NEXT: lhu a1, 0(a1) +; RV64IM-NEXT: lui a5, 1423 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, -1811 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a2, a1, a5 +; RV64IM-NEXT: sub a3, a1, a2 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a2, a3, a2 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: addi a3, zero, 95 +; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: sub t0, a1, a2 +; RV64IM-NEXT: mulhu a2, a4, a5 +; RV64IM-NEXT: sub a1, a4, a2 +; RV64IM-NEXT: srli a1, a1, 1 +; RV64IM-NEXT: add a1, a1, a2 +; RV64IM-NEXT: srli a1, a1, 6 +; RV64IM-NEXT: mul a1, a1, a3 +; RV64IM-NEXT: sub a1, a4, a1 +; RV64IM-NEXT: mulhu a2, a7, a5 +; RV64IM-NEXT: sub a4, a7, a2 +; RV64IM-NEXT: srli a4, a4, 1 +; RV64IM-NEXT: add a2, a4, a2 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: sub a2, a7, a2 +; RV64IM-NEXT: mulhu a4, a6, a5 +; RV64IM-NEXT: sub a5, a6, a4 +; RV64IM-NEXT: srli a5, a5, 1 +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: srli a4, a4, 6 +; RV64IM-NEXT: mul a3, a4, a3 +; RV64IM-NEXT: sub a3, a6, a3 +; RV64IM-NEXT: sh a3, 6(a0) +; RV64IM-NEXT: sh a2, 4(a0) +; RV64IM-NEXT: sh a1, 2(a0) +; RV64IM-NEXT: sh t0, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. 
+define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; RV32I-LABEL: combine_urem_udiv: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: .cfi_def_cfa_offset 48 +; RV32I-NEXT: sw ra, 44(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s1, 36(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s3, 28(sp) +; RV32I-NEXT: sw s4, 24(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s6, 16(sp) +; RV32I-NEXT: sw s7, 12(sp) +; RV32I-NEXT: sw s8, 8(sp) +; RV32I-NEXT: sw s9, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: .cfi_offset s6, -32 +; RV32I-NEXT: .cfi_offset s7, -36 +; RV32I-NEXT: .cfi_offset s8, -40 +; RV32I-NEXT: .cfi_offset s9, -44 +; RV32I-NEXT: lhu s2, 0(a1) +; RV32I-NEXT: lhu s3, 4(a1) +; RV32I-NEXT: lhu s4, 8(a1) +; RV32I-NEXT: lhu s1, 12(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s5, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s8, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: mv s9, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __udivsi3 +; RV32I-NEXT: add a0, s8, a0 +; RV32I-NEXT: add a1, s7, s1 +; RV32I-NEXT: add a2, s6, s4 +; RV32I-NEXT: add a3, s5, s9 +; RV32I-NEXT: sh a3, 6(s0) +; RV32I-NEXT: sh a2, 4(s0) +; RV32I-NEXT: sh a1, 2(s0) +; RV32I-NEXT: sh a0, 0(s0) +; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: lw s8, 8(sp) +; RV32I-NEXT: lw s7, 12(sp) +; RV32I-NEXT: lw s6, 16(sp) +; RV32I-NEXT: lw s5, 20(sp) +; RV32I-NEXT: lw s4, 24(sp) +; RV32I-NEXT: lw s3, 28(sp) +; RV32I-NEXT: lw s2, 32(sp) +; RV32I-NEXT: lw s1, 36(sp) +; RV32I-NEXT: lw s0, 40(sp) +; RV32I-NEXT: lw ra, 44(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: .cfi_restore s6 +; RV32I-NEXT: .cfi_restore s7 +; RV32I-NEXT: .cfi_restore s8 +; RV32I-NEXT: .cfi_restore s9 +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: combine_urem_udiv: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a6, 0(a1) +; RV32IM-NEXT: lhu a7, 4(a1) +; RV32IM-NEXT: lhu a4, 12(a1) +; RV32IM-NEXT: lhu a1, 8(a1) +; RV32IM-NEXT: lui a5, 364242 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a2, a4, a5 +; RV32IM-NEXT: sub a3, a4, a2 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: srli t3, a2, 6 +; RV32IM-NEXT: addi t0, zero, 95 +; RV32IM-NEXT: mul a3, t3, t0 +; RV32IM-NEXT: sub t1, a4, a3 +; RV32IM-NEXT: mulhu a4, a1, a5 +; RV32IM-NEXT: sub a3, a1, a4 +; RV32IM-NEXT: srli a3, a3, 1 +; RV32IM-NEXT: add a3, a3, a4 +; RV32IM-NEXT: srli a3, a3, 6 +; RV32IM-NEXT: mul a4, 
a3, t0 +; RV32IM-NEXT: sub t2, a1, a4 +; RV32IM-NEXT: mulhu a4, a7, a5 +; RV32IM-NEXT: sub a1, a7, a4 +; RV32IM-NEXT: srli a1, a1, 1 +; RV32IM-NEXT: add a1, a1, a4 +; RV32IM-NEXT: srli a1, a1, 6 +; RV32IM-NEXT: mul a4, a1, t0 +; RV32IM-NEXT: sub a4, a7, a4 +; RV32IM-NEXT: mulhu a5, a6, a5 +; RV32IM-NEXT: sub a2, a6, a5 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: mul a5, a2, t0 +; RV32IM-NEXT: sub a5, a6, a5 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: add a1, a4, a1 +; RV32IM-NEXT: add a3, t2, a3 +; RV32IM-NEXT: add a4, t1, t3 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) +; RV32IM-NEXT: sh a1, 2(a0) +; RV32IM-NEXT: sh a2, 0(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: combine_urem_udiv: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -96 +; RV64I-NEXT: .cfi_def_cfa_offset 96 +; RV64I-NEXT: sd ra, 88(sp) +; RV64I-NEXT: sd s0, 80(sp) +; RV64I-NEXT: sd s1, 72(sp) +; RV64I-NEXT: sd s2, 64(sp) +; RV64I-NEXT: sd s3, 56(sp) +; RV64I-NEXT: sd s4, 48(sp) +; RV64I-NEXT: sd s5, 40(sp) +; RV64I-NEXT: sd s6, 32(sp) +; RV64I-NEXT: sd s7, 24(sp) +; RV64I-NEXT: sd s8, 16(sp) +; RV64I-NEXT: sd s9, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: .cfi_offset s4, -48 +; RV64I-NEXT: .cfi_offset s5, -56 +; RV64I-NEXT: .cfi_offset s6, -64 +; RV64I-NEXT: .cfi_offset s7, -72 +; RV64I-NEXT: .cfi_offset s8, -80 +; RV64I-NEXT: .cfi_offset s9, -88 +; RV64I-NEXT: lhu s2, 0(a1) +; RV64I-NEXT: lhu s3, 8(a1) +; RV64I-NEXT: lhu s4, 16(a1) +; RV64I-NEXT: lhu s1, 24(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s5, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s6, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s7, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s8, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: mv s9, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s4 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: mv s4, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s3 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __udivdi3 +; RV64I-NEXT: add a0, s8, a0 +; RV64I-NEXT: add a1, s7, s1 +; RV64I-NEXT: add a2, s6, s4 +; RV64I-NEXT: add a3, s5, s9 +; RV64I-NEXT: sh a3, 6(s0) +; RV64I-NEXT: sh a2, 4(s0) +; RV64I-NEXT: sh a1, 2(s0) +; RV64I-NEXT: sh a0, 0(s0) +; RV64I-NEXT: ld s9, 8(sp) +; RV64I-NEXT: ld s8, 16(sp) +; RV64I-NEXT: ld s7, 24(sp) +; RV64I-NEXT: ld s6, 32(sp) +; RV64I-NEXT: ld s5, 40(sp) +; RV64I-NEXT: ld s4, 48(sp) +; RV64I-NEXT: ld s3, 56(sp) +; RV64I-NEXT: ld s2, 64(sp) +; RV64I-NEXT: ld s1, 72(sp) +; RV64I-NEXT: ld s0, 80(sp) +; RV64I-NEXT: ld ra, 88(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: .cfi_restore s4 +; RV64I-NEXT: .cfi_restore s5 +; RV64I-NEXT: .cfi_restore s6 +; RV64I-NEXT: .cfi_restore s7 +; RV64I-NEXT: .cfi_restore s8 +; RV64I-NEXT: .cfi_restore s9 +; RV64I-NEXT: addi sp, sp, 96 +; 
RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: combine_urem_udiv: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a6, 0(a1) +; RV64IM-NEXT: lhu a7, 8(a1) +; RV64IM-NEXT: lhu a4, 16(a1) +; RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: lui a5, 1423 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, -1811 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a2, a1, a5 +; RV64IM-NEXT: sub a3, a1, a2 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a2, a3, a2 +; RV64IM-NEXT: srli t3, a2, 6 +; RV64IM-NEXT: addi t0, zero, 95 +; RV64IM-NEXT: mul a3, t3, t0 +; RV64IM-NEXT: sub t1, a1, a3 +; RV64IM-NEXT: mulhu a3, a4, a5 +; RV64IM-NEXT: sub a1, a4, a3 +; RV64IM-NEXT: srli a1, a1, 1 +; RV64IM-NEXT: add a1, a1, a3 +; RV64IM-NEXT: srli a1, a1, 6 +; RV64IM-NEXT: mul a3, a1, t0 +; RV64IM-NEXT: sub t2, a4, a3 +; RV64IM-NEXT: mulhu a4, a7, a5 +; RV64IM-NEXT: sub a3, a7, a4 +; RV64IM-NEXT: srli a3, a3, 1 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: srli a3, a3, 6 +; RV64IM-NEXT: mul a4, a3, t0 +; RV64IM-NEXT: sub a4, a7, a4 +; RV64IM-NEXT: mulhu a5, a6, a5 +; RV64IM-NEXT: sub a2, a6, a5 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: mul a5, a2, t0 +; RV64IM-NEXT: sub a5, a6, a5 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: add a1, t2, a1 +; RV64IM-NEXT: add a4, t1, t3 +; RV64IM-NEXT: sh a4, 6(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_urem_power_of_two: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: lhu s2, 8(a1) +; RV32I-NEXT: lhu s3, 4(a1) +; RV32I-NEXT: lhu s1, 0(a1) +; RV32I-NEXT: lhu a2, 12(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 95 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: andi a1, s1, 63 +; RV32I-NEXT: andi a2, s3, 31 +; RV32I-NEXT: andi a3, s2, 7 +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh a3, 4(s0) +; RV32I-NEXT: sh a2, 2(s0) +; RV32I-NEXT: sh a1, 0(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_power_of_two: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a6, 8(a1) +; RV32IM-NEXT: lhu a3, 4(a1) +; RV32IM-NEXT: lhu a4, 12(a1) +; RV32IM-NEXT: lhu a1, 0(a1) +; RV32IM-NEXT: lui a5, 364242 +; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: mulhu a5, a4, a5 +; RV32IM-NEXT: sub a2, a4, a5 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: srli a2, a2, 6 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a4, a2 +; RV32IM-NEXT: andi a1, a1, 63 +; RV32IM-NEXT: andi a3, a3, 31 +; RV32IM-NEXT: andi a4, a6, 7 +; RV32IM-NEXT: sh a4, 4(a0) +; RV32IM-NEXT: sh a3, 2(a0) +; RV32IM-NEXT: sh a1, 0(a0) +; RV32IM-NEXT: sh a2, 6(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_power_of_two: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: lhu s2, 16(a1) +; RV64I-NEXT: lhu s3, 8(a1) +; RV64I-NEXT: lhu s1, 0(a1) +; RV64I-NEXT: lhu a2, 24(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 95 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: andi a1, s1, 63 +; RV64I-NEXT: andi a2, s3, 31 +; RV64I-NEXT: andi a3, s2, 7 +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh a3, 4(s0) +; RV64I-NEXT: sh a2, 2(s0) +; RV64I-NEXT: sh a1, 0(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_urem_power_of_two: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a6, 16(a1) +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a4, 0(a1) +; 
RV64IM-NEXT: lhu a1, 24(a1) +; RV64IM-NEXT: lui a5, 1423 +; RV64IM-NEXT: addiw a5, a5, -733 +; RV64IM-NEXT: slli a5, a5, 15 +; RV64IM-NEXT: addi a5, a5, 1035 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, -1811 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 561 +; RV64IM-NEXT: mulhu a5, a1, a5 +; RV64IM-NEXT: sub a2, a1, a5 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a5 +; RV64IM-NEXT: srli a2, a2, 6 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a1, a1, a2 +; RV64IM-NEXT: andi a2, a4, 63 +; RV64IM-NEXT: andi a3, a3, 31 +; RV64IM-NEXT: andi a4, a6, 7 +; RV64IM-NEXT: sh a4, 4(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a2, 0(a0) +; RV64IM-NEXT: sh a1, 6(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { +; RV32I-LABEL: dont_fold_urem_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: .cfi_def_cfa_offset 32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: lhu s2, 12(a1) +; RV32I-NEXT: lhu s1, 8(a1) +; RV32I-NEXT: lhu a2, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a1, zero, 654 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: addi a1, zero, 23 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a1, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call __umodsi3 +; RV32I-NEXT: sh zero, 0(s0) +; RV32I-NEXT: sh a0, 6(s0) +; RV32I-NEXT: sh s1, 4(s0) +; RV32I-NEXT: sh s3, 2(s0) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_one: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lhu a2, 4(a1) +; RV32IM-NEXT: lhu a3, 12(a1) +; RV32IM-NEXT: lhu a1, 8(a1) +; RV32IM-NEXT: srli a4, a2, 1 +; RV32IM-NEXT: lui a5, 820904 +; RV32IM-NEXT: addi a5, a5, -1903 +; RV32IM-NEXT: mulhu a4, a4, a5 +; RV32IM-NEXT: srli a4, a4, 8 +; RV32IM-NEXT: addi a5, zero, 654 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a2, a2, a4 +; RV32IM-NEXT: lui a4, 729444 +; RV32IM-NEXT: addi a4, a4, 713 +; RV32IM-NEXT: mulhu a4, a1, a4 +; RV32IM-NEXT: srli a4, a4, 4 +; RV32IM-NEXT: addi a5, zero, 23 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a1, a1, a4 +; RV32IM-NEXT: lui a4, 395996 +; RV32IM-NEXT: addi a4, a4, -2009 +; RV32IM-NEXT: mulhu a4, a3, a4 +; RV32IM-NEXT: srli a4, a4, 11 +; RV32IM-NEXT: lui a5, 1 +; RV32IM-NEXT: addi a5, a5, 1327 +; RV32IM-NEXT: mul a4, a4, a5 +; RV32IM-NEXT: sub a3, a3, a4 +; RV32IM-NEXT: sh zero, 0(a0) +; RV32IM-NEXT: sh a3, 6(a0) +; RV32IM-NEXT: sh a1, 4(a0) +; RV32IM-NEXT: sh a2, 2(a0) +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: 
sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: lhu s2, 24(a1) +; RV64I-NEXT: lhu s1, 16(a1) +; RV64I-NEXT: lhu a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sh zero, 0(s0) +; RV64I-NEXT: sh a0, 6(s0) +; RV64I-NEXT: sh s1, 4(s0) +; RV64I-NEXT: sh s3, 2(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: dont_fold_urem_one: +; RV64IM: # %bb.0: +; RV64IM-NEXT: lhu a2, 24(a1) +; RV64IM-NEXT: lhu a3, 8(a1) +; RV64IM-NEXT: lhu a1, 16(a1) +; RV64IM-NEXT: lui a4, 3206 +; RV64IM-NEXT: addiw a4, a4, -1781 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1069 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1959 +; RV64IM-NEXT: slli a4, a4, 14 +; RV64IM-NEXT: addi a4, a4, 713 +; RV64IM-NEXT: mulhu a4, a1, a4 +; RV64IM-NEXT: sub a5, a1, a4 +; RV64IM-NEXT: srli a5, a5, 1 +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: srli a4, a3, 1 +; RV64IM-NEXT: lui a5, 6413 +; RV64IM-NEXT: addiw a5, a5, 1265 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1027 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1077 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 965 +; RV64IM-NEXT: mulhu a4, a4, a5 +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: addi a5, zero, 654 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 1044567 +; RV64IM-NEXT: addiw a4, a4, -575 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 883 +; RV64IM-NEXT: slli a4, a4, 14 +; RV64IM-NEXT: addi a4, a4, -861 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -179 +; RV64IM-NEXT: mulhu a4, a2, a4 +; RV64IM-NEXT: srli a4, a4, 12 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sh zero, 0(a0) +; RV64IM-NEXT: sh a2, 6(a0) +; RV64IM-NEXT: sh a3, 2(a0) +; RV64IM-NEXT: sh a1, 4(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. 
+define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; RV32I-LABEL: dont_fold_urem_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: .cfi_def_cfa_offset 48 +; RV32I-NEXT: sw ra, 44(sp) +; RV32I-NEXT: sw s0, 40(sp) +; RV32I-NEXT: sw s1, 36(sp) +; RV32I-NEXT: sw s2, 32(sp) +; RV32I-NEXT: sw s3, 28(sp) +; RV32I-NEXT: sw s4, 24(sp) +; RV32I-NEXT: sw s5, 20(sp) +; RV32I-NEXT: sw s6, 16(sp) +; RV32I-NEXT: sw s7, 12(sp) +; RV32I-NEXT: sw s8, 8(sp) +; RV32I-NEXT: sw s9, 4(sp) +; RV32I-NEXT: .cfi_offset ra, -4 +; RV32I-NEXT: .cfi_offset s0, -8 +; RV32I-NEXT: .cfi_offset s1, -12 +; RV32I-NEXT: .cfi_offset s2, -16 +; RV32I-NEXT: .cfi_offset s3, -20 +; RV32I-NEXT: .cfi_offset s4, -24 +; RV32I-NEXT: .cfi_offset s5, -28 +; RV32I-NEXT: .cfi_offset s6, -32 +; RV32I-NEXT: .cfi_offset s7, -36 +; RV32I-NEXT: .cfi_offset s8, -40 +; RV32I-NEXT: .cfi_offset s9, -44 +; RV32I-NEXT: lw s2, 24(a1) +; RV32I-NEXT: lw s3, 28(a1) +; RV32I-NEXT: lw s4, 16(a1) +; RV32I-NEXT: lw s5, 20(a1) +; RV32I-NEXT: lw s6, 8(a1) +; RV32I-NEXT: lw s1, 12(a1) +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a1, 4(a1) +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: mv s7, a0 +; RV32I-NEXT: mv s8, a1 +; RV32I-NEXT: addi a2, zero, 654 +; RV32I-NEXT: mv a0, s6 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: mv s6, a0 +; RV32I-NEXT: mv s9, a1 +; RV32I-NEXT: addi a2, zero, 23 +; RV32I-NEXT: mv a0, s4 +; RV32I-NEXT: mv a1, s5 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a2, a0, 1327 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a1, s3 +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: call __umoddi3 +; RV32I-NEXT: sw a1, 28(s0) +; RV32I-NEXT: sw a0, 24(s0) +; RV32I-NEXT: sw s1, 20(s0) +; RV32I-NEXT: sw s4, 16(s0) +; RV32I-NEXT: sw s9, 12(s0) +; RV32I-NEXT: sw s6, 8(s0) +; RV32I-NEXT: sw s8, 4(s0) +; RV32I-NEXT: sw s7, 0(s0) +; RV32I-NEXT: lw s9, 4(sp) +; RV32I-NEXT: lw s8, 8(sp) +; RV32I-NEXT: lw s7, 12(sp) +; RV32I-NEXT: lw s6, 16(sp) +; RV32I-NEXT: lw s5, 20(sp) +; RV32I-NEXT: lw s4, 24(sp) +; RV32I-NEXT: lw s3, 28(sp) +; RV32I-NEXT: lw s2, 32(sp) +; RV32I-NEXT: lw s1, 36(sp) +; RV32I-NEXT: lw s0, 40(sp) +; RV32I-NEXT: lw ra, 44(sp) +; RV32I-NEXT: .cfi_restore ra +; RV32I-NEXT: .cfi_restore s0 +; RV32I-NEXT: .cfi_restore s1 +; RV32I-NEXT: .cfi_restore s2 +; RV32I-NEXT: .cfi_restore s3 +; RV32I-NEXT: .cfi_restore s4 +; RV32I-NEXT: .cfi_restore s5 +; RV32I-NEXT: .cfi_restore s6 +; RV32I-NEXT: .cfi_restore s7 +; RV32I-NEXT: .cfi_restore s8 +; RV32I-NEXT: .cfi_restore s9 +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: .cfi_def_cfa_offset 0 +; RV32I-NEXT: ret +; +; RV32IM-LABEL: dont_fold_urem_i64: +; RV32IM: # %bb.0: +; RV32IM-NEXT: addi sp, sp, -48 +; RV32IM-NEXT: .cfi_def_cfa_offset 48 +; RV32IM-NEXT: sw ra, 44(sp) +; RV32IM-NEXT: sw s0, 40(sp) +; RV32IM-NEXT: sw s1, 36(sp) +; RV32IM-NEXT: sw s2, 32(sp) +; RV32IM-NEXT: sw s3, 28(sp) +; RV32IM-NEXT: sw s4, 24(sp) +; RV32IM-NEXT: sw s5, 20(sp) +; RV32IM-NEXT: sw s6, 16(sp) +; RV32IM-NEXT: sw s7, 12(sp) +; RV32IM-NEXT: sw s8, 8(sp) +; RV32IM-NEXT: sw s9, 4(sp) +; RV32IM-NEXT: .cfi_offset ra, -4 +; RV32IM-NEXT: .cfi_offset s0, -8 +; RV32IM-NEXT: .cfi_offset s1, -12 +; RV32IM-NEXT: .cfi_offset s2, -16 +; RV32IM-NEXT: .cfi_offset s3, -20 +; RV32IM-NEXT: .cfi_offset s4, -24 +; RV32IM-NEXT: .cfi_offset s5, -28 +; RV32IM-NEXT: .cfi_offset 
s6, -32 +; RV32IM-NEXT: .cfi_offset s7, -36 +; RV32IM-NEXT: .cfi_offset s8, -40 +; RV32IM-NEXT: .cfi_offset s9, -44 +; RV32IM-NEXT: lw s2, 24(a1) +; RV32IM-NEXT: lw s3, 28(a1) +; RV32IM-NEXT: lw s4, 16(a1) +; RV32IM-NEXT: lw s5, 20(a1) +; RV32IM-NEXT: lw s6, 8(a1) +; RV32IM-NEXT: lw s1, 12(a1) +; RV32IM-NEXT: lw a3, 0(a1) +; RV32IM-NEXT: lw a1, 4(a1) +; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: addi a2, zero, 1 +; RV32IM-NEXT: mv a0, a3 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: mv s7, a0 +; RV32IM-NEXT: mv s8, a1 +; RV32IM-NEXT: addi a2, zero, 654 +; RV32IM-NEXT: mv a0, s6 +; RV32IM-NEXT: mv a1, s1 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: mv s6, a0 +; RV32IM-NEXT: mv s9, a1 +; RV32IM-NEXT: addi a2, zero, 23 +; RV32IM-NEXT: mv a0, s4 +; RV32IM-NEXT: mv a1, s5 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: mv s4, a0 +; RV32IM-NEXT: mv s1, a1 +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a2, a0, 1327 +; RV32IM-NEXT: mv a0, s2 +; RV32IM-NEXT: mv a1, s3 +; RV32IM-NEXT: mv a3, zero +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: sw a1, 28(s0) +; RV32IM-NEXT: sw a0, 24(s0) +; RV32IM-NEXT: sw s1, 20(s0) +; RV32IM-NEXT: sw s4, 16(s0) +; RV32IM-NEXT: sw s9, 12(s0) +; RV32IM-NEXT: sw s6, 8(s0) +; RV32IM-NEXT: sw s8, 4(s0) +; RV32IM-NEXT: sw s7, 0(s0) +; RV32IM-NEXT: lw s9, 4(sp) +; RV32IM-NEXT: lw s8, 8(sp) +; RV32IM-NEXT: lw s7, 12(sp) +; RV32IM-NEXT: lw s6, 16(sp) +; RV32IM-NEXT: lw s5, 20(sp) +; RV32IM-NEXT: lw s4, 24(sp) +; RV32IM-NEXT: lw s3, 28(sp) +; RV32IM-NEXT: lw s2, 32(sp) +; RV32IM-NEXT: lw s1, 36(sp) +; RV32IM-NEXT: lw s0, 40(sp) +; RV32IM-NEXT: lw ra, 44(sp) +; RV32IM-NEXT: .cfi_restore ra +; RV32IM-NEXT: .cfi_restore s0 +; RV32IM-NEXT: .cfi_restore s1 +; RV32IM-NEXT: .cfi_restore s2 +; RV32IM-NEXT: .cfi_restore s3 +; RV32IM-NEXT: .cfi_restore s4 +; RV32IM-NEXT: .cfi_restore s5 +; RV32IM-NEXT: .cfi_restore s6 +; RV32IM-NEXT: .cfi_restore s7 +; RV32IM-NEXT: .cfi_restore s8 +; RV32IM-NEXT: .cfi_restore s9 +; RV32IM-NEXT: addi sp, sp, 48 +; RV32IM-NEXT: .cfi_def_cfa_offset 0 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: dont_fold_urem_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -48 +; RV64I-NEXT: .cfi_def_cfa_offset 48 +; RV64I-NEXT: sd ra, 40(sp) +; RV64I-NEXT: sd s0, 32(sp) +; RV64I-NEXT: sd s1, 24(sp) +; RV64I-NEXT: sd s2, 16(sp) +; RV64I-NEXT: sd s3, 8(sp) +; RV64I-NEXT: .cfi_offset ra, -8 +; RV64I-NEXT: .cfi_offset s0, -16 +; RV64I-NEXT: .cfi_offset s1, -24 +; RV64I-NEXT: .cfi_offset s2, -32 +; RV64I-NEXT: .cfi_offset s3, -40 +; RV64I-NEXT: ld s2, 24(a1) +; RV64I-NEXT: ld s1, 16(a1) +; RV64I-NEXT: ld a2, 8(a1) +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: addi a1, zero, 654 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s3, a0 +; RV64I-NEXT: addi a1, zero, 23 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: mv s1, a0 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a1, a0, 1327 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call __umoddi3 +; RV64I-NEXT: sd zero, 0(s0) +; RV64I-NEXT: sd a0, 24(s0) +; RV64I-NEXT: sd s1, 16(s0) +; RV64I-NEXT: sd s3, 8(s0) +; RV64I-NEXT: ld s3, 8(sp) +; RV64I-NEXT: ld s2, 16(sp) +; RV64I-NEXT: ld s1, 24(sp) +; RV64I-NEXT: ld s0, 32(sp) +; RV64I-NEXT: ld ra, 40(sp) +; RV64I-NEXT: .cfi_restore ra +; RV64I-NEXT: .cfi_restore s0 +; RV64I-NEXT: .cfi_restore s1 +; RV64I-NEXT: .cfi_restore s2 +; RV64I-NEXT: .cfi_restore s3 +; RV64I-NEXT: addi sp, sp, 48 +; RV64I-NEXT: .cfi_def_cfa_offset 0 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: 
dont_fold_urem_i64: +; RV64IM: # %bb.0: +; RV64IM-NEXT: ld a2, 24(a1) +; RV64IM-NEXT: ld a3, 8(a1) +; RV64IM-NEXT: ld a1, 16(a1) +; RV64IM-NEXT: lui a4, 3206 +; RV64IM-NEXT: addiw a4, a4, -1781 +; RV64IM-NEXT: slli a4, a4, 13 +; RV64IM-NEXT: addi a4, a4, 1069 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -1959 +; RV64IM-NEXT: slli a4, a4, 14 +; RV64IM-NEXT: addi a4, a4, 713 +; RV64IM-NEXT: mulhu a4, a1, a4 +; RV64IM-NEXT: sub a5, a1, a4 +; RV64IM-NEXT: srli a5, a5, 1 +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: srli a4, a4, 4 +; RV64IM-NEXT: addi a5, zero, 23 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a1, a1, a4 +; RV64IM-NEXT: srli a4, a3, 1 +; RV64IM-NEXT: lui a5, 6413 +; RV64IM-NEXT: addiw a5, a5, 1265 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1027 +; RV64IM-NEXT: slli a5, a5, 13 +; RV64IM-NEXT: addi a5, a5, 1077 +; RV64IM-NEXT: slli a5, a5, 12 +; RV64IM-NEXT: addi a5, a5, 965 +; RV64IM-NEXT: mulhu a4, a4, a5 +; RV64IM-NEXT: srli a4, a4, 7 +; RV64IM-NEXT: addi a5, zero, 654 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a3, a3, a4 +; RV64IM-NEXT: lui a4, 1044567 +; RV64IM-NEXT: addiw a4, a4, -575 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, 883 +; RV64IM-NEXT: slli a4, a4, 14 +; RV64IM-NEXT: addi a4, a4, -861 +; RV64IM-NEXT: slli a4, a4, 12 +; RV64IM-NEXT: addi a4, a4, -179 +; RV64IM-NEXT: mulhu a4, a2, a4 +; RV64IM-NEXT: srli a4, a4, 12 +; RV64IM-NEXT: lui a5, 1 +; RV64IM-NEXT: addiw a5, a5, 1327 +; RV64IM-NEXT: mul a4, a4, a5 +; RV64IM-NEXT: sub a2, a2, a4 +; RV64IM-NEXT: sd zero, 0(a0) +; RV64IM-NEXT: sd a2, 24(a0) +; RV64IM-NEXT: sd a3, 8(a0) +; RV64IM-NEXT: sd a1, 16(a0) +; RV64IM-NEXT: .cfi_def_cfa_offset 0 +; RV64IM-NEXT: ret + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-add-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-add-01.ll index 57f75cc871d715..d95ab9331e88f3 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-add-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-add-01.ll @@ -8,19 +8,19 @@ declare float @foo() declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) ; Check register addition. -define float @f1(float %f1, float %f2) { +define float @f1(float %f1, float %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: aebr %f0, %f2 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.fadd.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the low end of the AEB range. -define float @f2(float %f1, float *%ptr) { +define float @f2(float %f1, float *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: aeb %f0, 0(%r2) ; CHECK: br %r14 @@ -28,12 +28,12 @@ define float @f2(float %f1, float *%ptr) { %res = call float @llvm.experimental.constrained.fadd.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the high end of the aligned AEB range. -define float @f3(float %f1, float *%base) { +define float @f3(float %f1, float *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: aeb %f0, 4092(%r2) ; CHECK: br %r14 @@ -42,13 +42,13 @@ define float @f3(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fadd.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the next word up, which needs separate address logic. ; Other sequences besides this one would be OK. 
-define float @f4(float %f1, float *%base) { +define float @f4(float %f1, float *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: aeb %f0, 0(%r2) @@ -58,12 +58,12 @@ define float @f4(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fadd.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check negative displacements, which also need separate address logic. -define float @f5(float %f1, float *%base) { +define float @f5(float %f1, float *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: aeb %f0, 0(%r2) @@ -73,12 +73,12 @@ define float @f5(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fadd.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that AEB allows indices. -define float @f6(float %f1, float *%base, i64 %index) { +define float @f6(float %f1, float *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: aeb %f0, 400(%r1,%r2) @@ -89,12 +89,12 @@ define float @f6(float %f1, float *%base, i64 %index) { %res = call float @llvm.experimental.constrained.fadd.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that additions of spilled values can use AEB rather than AEBR. -define float @f7(float *%ptr0) { +define float @f7(float *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK-SCALAR: aeb %f0, 16{{[04]}}(%r15) @@ -122,52 +122,54 @@ define float @f7(float *%ptr0) { %val9 = load float, float *%ptr9 %val10 = load float, float *%ptr10 - %ret = call float @foo() + %ret = call float @foo() #0 %add0 = call float @llvm.experimental.constrained.fadd.f32( float %ret, float %val0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add1 = call float @llvm.experimental.constrained.fadd.f32( float %add0, float %val1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add2 = call float @llvm.experimental.constrained.fadd.f32( float %add1, float %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add3 = call float @llvm.experimental.constrained.fadd.f32( float %add2, float %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add4 = call float @llvm.experimental.constrained.fadd.f32( float %add3, float %val4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add5 = call float @llvm.experimental.constrained.fadd.f32( float %add4, float %val5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add6 = call float @llvm.experimental.constrained.fadd.f32( float %add5, float %val6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add7 = call float @llvm.experimental.constrained.fadd.f32( float %add6, float %val7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add8 = call float @llvm.experimental.constrained.fadd.f32( float %add7, float %val8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add9 = call float @llvm.experimental.constrained.fadd.f32( float %add8, float %val9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + 
metadata !"fpexcept.strict") #0 %add10 = call float @llvm.experimental.constrained.fadd.f32( float %add9, float %val10, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %add10 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-add-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-add-02.ll index 739290969af5d5..2693aef7c7ff46 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-add-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-add-02.ll @@ -7,19 +7,19 @@ declare double @foo() declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) ; Check register addition. -define double @f1(double %f1, double %f2) { +define double @f1(double %f1, double %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: adbr %f0, %f2 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.fadd.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the low end of the ADB range. -define double @f2(double %f1, double *%ptr) { +define double @f2(double %f1, double *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: adb %f0, 0(%r2) ; CHECK: br %r14 @@ -27,12 +27,12 @@ define double @f2(double %f1, double *%ptr) { %res = call double @llvm.experimental.constrained.fadd.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the high end of the aligned ADB range. -define double @f3(double %f1, double *%base) { +define double @f3(double %f1, double *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: adb %f0, 4088(%r2) ; CHECK: br %r14 @@ -41,13 +41,13 @@ define double @f3(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fadd.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the next doubleword up, which needs separate address logic. ; Other sequences besides this one would be OK. -define double @f4(double %f1, double *%base) { +define double @f4(double %f1, double *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: adb %f0, 0(%r2) @@ -57,12 +57,12 @@ define double @f4(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fadd.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check negative displacements, which also need separate address logic. -define double @f5(double %f1, double *%base) { +define double @f5(double %f1, double *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -8 ; CHECK: adb %f0, 0(%r2) @@ -72,12 +72,12 @@ define double @f5(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fadd.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that ADB allows indices. 
-define double @f6(double %f1, double *%base, i64 %index) { +define double @f6(double %f1, double *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 3 ; CHECK: adb %f0, 800(%r1,%r2) @@ -88,12 +88,12 @@ define double @f6(double %f1, double *%base, i64 %index) { %res = call double @llvm.experimental.constrained.fadd.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that additions of spilled values can use ADB rather than ADBR. -define double @f7(double *%ptr0) { +define double @f7(double *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK-SCALAR: adb %f0, 160(%r15) @@ -121,52 +121,54 @@ define double @f7(double *%ptr0) { %val9 = load double, double *%ptr9 %val10 = load double, double *%ptr10 - %ret = call double @foo() + %ret = call double @foo() #0 %add0 = call double @llvm.experimental.constrained.fadd.f64( double %ret, double %val0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add1 = call double @llvm.experimental.constrained.fadd.f64( double %add0, double %val1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add2 = call double @llvm.experimental.constrained.fadd.f64( double %add1, double %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add3 = call double @llvm.experimental.constrained.fadd.f64( double %add2, double %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add4 = call double @llvm.experimental.constrained.fadd.f64( double %add3, double %val4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add5 = call double @llvm.experimental.constrained.fadd.f64( double %add4, double %val5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add6 = call double @llvm.experimental.constrained.fadd.f64( double %add5, double %val6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add7 = call double @llvm.experimental.constrained.fadd.f64( double %add6, double %val7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add8 = call double @llvm.experimental.constrained.fadd.f64( double %add7, double %val8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add9 = call double @llvm.experimental.constrained.fadd.f64( double %add8, double %val9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %add10 = call double @llvm.experimental.constrained.fadd.f64( double %add9, double %val10, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %add10 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-add-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-add-03.ll index d2535c9f0b05bf..0aeef7c25453c0 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-add-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-add-03.ll @@ -5,7 +5,7 @@ declare fp128 @llvm.experimental.constrained.fadd.f128(fp128, fp128, metadata, metadata) ; There is no memory form of 128-bit addition. 
-define void @f1(fp128 *%ptr, float %f2) { +define void @f1(fp128 *%ptr, float %f2) strictfp { ; CHECK-LABEL: f1: ; CHECK-DAG: lxebr %f0, %f0 ; CHECK-DAG: ld %f1, 0(%r2) @@ -19,7 +19,7 @@ define void @f1(fp128 *%ptr, float %f2) { %sum = call fp128 @llvm.experimental.constrained.fadd.f128( fp128 %f1, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %sum, fp128 *%ptr ret void } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-add-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-add-04.ll index d4ec5fc6854650..98a3454a8986e9 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-add-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-add-04.ll @@ -4,7 +4,7 @@ declare fp128 @llvm.experimental.constrained.fadd.f128(fp128, fp128, metadata, metadata) -define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +define void @f1(fp128 *%ptr1, fp128 *%ptr2) strictfp { ; CHECK-LABEL: f1: ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) ; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) @@ -16,7 +16,7 @@ define void @f1(fp128 *%ptr1, fp128 *%ptr2) { %sum = call fp128 @llvm.experimental.constrained.fadd.f128( fp128 %f1, fp128 %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %sum, fp128 *%ptr1 ret void } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll b/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll index fe27b61c20bab7..4d675cba4ced7a 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-alias.ll @@ -30,7 +30,7 @@ define void @f1(float %f1, float %f2, float *%ptr1, float *%ptr2) { ret void } -define void @f2(float %f1, float %f2, float *%ptr1, float *%ptr2) { +define void @f2(float %f1, float %f2, float *%ptr1, float *%ptr2) #0 { ; CHECK-LABEL: f2: ; CHECK: sqebr ; CHECK: ste @@ -41,11 +41,11 @@ define void @f2(float %f1, float %f2, float *%ptr1, float *%ptr2) { %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32( float %f1, metadata !"round.dynamic", - metadata !"fpexcept.ignore") + metadata !"fpexcept.ignore") #0 %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32( float %f2, metadata !"round.dynamic", - metadata !"fpexcept.ignore") + metadata !"fpexcept.ignore") #0 store float %sqrt1, float *%ptr1 store float %sqrt2, float *%ptr2 @@ -53,7 +53,7 @@ define void @f2(float %f1, float %f2, float *%ptr1, float *%ptr2) { ret void } -define void @f3(float %f1, float %f2, float *%ptr1, float *%ptr2) { +define void @f3(float %f1, float %f2, float *%ptr1, float *%ptr2) #0 { ; CHECK-LABEL: f3: ; CHECK: sqebr ; CHECK: ste @@ -64,11 +64,11 @@ define void @f3(float %f1, float %f2, float *%ptr1, float *%ptr2) { %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32( float %f1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32( float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store float %sqrt1, float *%ptr1 store float %sqrt2, float *%ptr2 @@ -98,7 +98,7 @@ define void @f4(float %f1, float %f2, float *%ptr1, float *%ptr2) { ret void } -define void @f5(float %f1, float %f2, float *%ptr1, float *%ptr2) { +define void @f5(float %f1, float %f2, float *%ptr1, float *%ptr2) #0 { ; CHECK-LABEL: f5: ; CHECK: sqebr ; CHECK: ste @@ -109,11 +109,11 @@ define void @f5(float %f1, float %f2, float *%ptr1, float *%ptr2) { %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32( float %f1, metadata 
!"round.dynamic", - metadata !"fpexcept.ignore") + metadata !"fpexcept.ignore") #0 %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32( float %f2, metadata !"round.dynamic", - metadata !"fpexcept.ignore") + metadata !"fpexcept.ignore") #0 store volatile float %sqrt1, float *%ptr1 store volatile float %sqrt2, float *%ptr2 @@ -121,7 +121,7 @@ define void @f5(float %f1, float %f2, float *%ptr1, float *%ptr2) { ret void } -define void @f6(float %f1, float %f2, float *%ptr1, float *%ptr2) { +define void @f6(float %f1, float %f2, float *%ptr1, float *%ptr2) #0 { ; CHECK-LABEL: f6: ; CHECK: sqebr ; CHECK: sqebr @@ -132,11 +132,11 @@ define void @f6(float %f1, float %f2, float *%ptr1, float *%ptr2) { %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32( float %f1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32( float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store volatile float %sqrt1, float *%ptr1 store volatile float %sqrt2, float *%ptr2 @@ -166,7 +166,7 @@ define void @f7(float %f1, float %f2, float *%ptr1, float *%ptr2) { ret void } -define void @f8(float %f1, float %f2, float *%ptr1, float *%ptr2) { +define void @f8(float %f1, float %f2, float *%ptr1, float *%ptr2) #0 { ; CHECK-LABEL: f8: ; CHECK: sqebr ; CHECK: sqebr @@ -177,13 +177,13 @@ define void @f8(float %f1, float %f2, float *%ptr1, float *%ptr2) { %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32( float %f1, metadata !"round.dynamic", - metadata !"fpexcept.ignore") + metadata !"fpexcept.ignore") #0 %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32( float %f2, metadata !"round.dynamic", - metadata !"fpexcept.ignore") + metadata !"fpexcept.ignore") #0 - call void @llvm.s390.sfpc(i32 0) + call void @llvm.s390.sfpc(i32 0) #0 store float %sqrt1, float *%ptr1 store float %sqrt2, float *%ptr2 @@ -191,7 +191,7 @@ define void @f8(float %f1, float %f2, float *%ptr1, float *%ptr2) { ret void } -define void @f9(float %f1, float %f2, float *%ptr1, float *%ptr2) { +define void @f9(float %f1, float %f2, float *%ptr1, float *%ptr2) #0 { ; CHECK-LABEL: f9: ; CHECK: sqebr ; CHECK: sqebr @@ -202,13 +202,13 @@ define void @f9(float %f1, float %f2, float *%ptr1, float *%ptr2) { %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32( float %f1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32( float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 - call void @llvm.s390.sfpc(i32 0) + call void @llvm.s390.sfpc(i32 0) #0 store float %sqrt1, float *%ptr1 store float %sqrt2, float *%ptr2 @@ -216,3 +216,4 @@ define void @f9(float %f1, float %f2, float *%ptr1, float *%ptr2) { ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-01.ll index b20ab71808b718..45dc51ea56b169 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-01.ll @@ -13,7 +13,7 @@ declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, me declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) ; Test f64->f32. 
-define float @f1(double %d1, double %d2) { +define float @f1(double %d1, double %d2) #0 { ; CHECK-LABEL: f1: ; CHECK-SCALAR: ledbr %f0, %f2 ; CHECK-VECTOR: ledbra %f0, 0, %f2, 0 @@ -21,12 +21,12 @@ define float @f1(double %d1, double %d2) { %res = call float @llvm.experimental.constrained.fptrunc.f32.f64( double %d2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test f128->f32. -define float @f2(fp128 *%ptr) { +define float @f2(fp128 *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: lexbr %f0, %f0 ; CHECK: br %r14 @@ -34,13 +34,13 @@ define float @f2(fp128 *%ptr) { %res = call float @llvm.experimental.constrained.fptrunc.f32.f128( fp128 %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Make sure that we don't use %f0 as the destination of LEXBR when %f2 ; is still live. -define void @f3(float *%dst, fp128 *%ptr, float %d1, float %d2) { +define void @f3(float *%dst, fp128 *%ptr, float %d1, float %d2) #0 { ; CHECK-LABEL: f3: ; CHECK: lexbr %f1, %f1 ; CHECK: aebr %f1, %f2 @@ -50,17 +50,17 @@ define void @f3(float *%dst, fp128 *%ptr, float %d1, float %d2) { %conv = call float @llvm.experimental.constrained.fptrunc.f32.f128( fp128 %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %res = call float @llvm.experimental.constrained.fadd.f32( float %conv, float %d2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store float %res, float *%dst ret void } ; Test f128->f64. -define double @f4(fp128 *%ptr) { +define double @f4(fp128 *%ptr) #0 { ; CHECK-LABEL: f4: ; CHECK: ldxbr %f0, %f0 ; CHECK: br %r14 @@ -68,12 +68,12 @@ define double @f4(fp128 *%ptr) { %res = call double @llvm.experimental.constrained.fptrunc.f64.f128( fp128 %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Like f3, but for f128->f64. -define void @f5(double *%dst, fp128 *%ptr, double %d1, double %d2) { +define void @f5(double *%dst, fp128 *%ptr, double %d1, double %d2) #0 { ; CHECK-LABEL: f5: ; CHECK: ldxbr %f1, %f1 ; CHECK-SCALAR: adbr %f1, %f2 @@ -85,11 +85,13 @@ define void @f5(double *%dst, fp128 *%ptr, double %d1, double %d2) { %conv = call double @llvm.experimental.constrained.fptrunc.f64.f128( fp128 %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %res = call double @llvm.experimental.constrained.fadd.f64( double %conv, double %d2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store double %res, double *%dst ret void } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll index 0f24b91e26621d..4cada62d003294 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-02.ll @@ -5,41 +5,41 @@ declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) ; Check register extension. -define double @f1(float %val) { +define double @f1(float %val) #0 { ; CHECK-LABEL: f1: ; CHECK: ldebr %f0, %f0 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the low end of the LDEB range. 
-define double @f2(float *%ptr) { +define double @f2(float *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: ldeb %f0, 0(%r2) ; CHECK: br %r14 %val = load float, float *%ptr %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the high end of the aligned LDEB range. -define double @f3(float *%base) { +define double @f3(float *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: ldeb %f0, 4092(%r2) ; CHECK: br %r14 %ptr = getelementptr float, float *%base, i64 1023 %val = load float, float *%ptr %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the next word up, which needs separate address logic. ; Other sequences besides this one would be OK. -define double @f4(float *%base) { +define double @f4(float *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: ldeb %f0, 0(%r2) @@ -47,12 +47,12 @@ define double @f4(float *%base) { %ptr = getelementptr float, float *%base, i64 1024 %val = load float, float *%ptr %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check negative displacements, which also need separate address logic. -define double @f5(float *%base) { +define double @f5(float *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: ldeb %f0, 0(%r2) @@ -60,12 +60,12 @@ define double @f5(float *%base) { %ptr = getelementptr float, float *%base, i64 -1 %val = load float, float *%ptr %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that LDEB allows indices. -define double @f6(float *%base, i64 %index) { +define double @f6(float *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: ldeb %f0, 400(%r1,%r2) @@ -74,7 +74,8 @@ define double @f6(float *%base, i64 %index) { %ptr2 = getelementptr float, float *%ptr1, i64 100 %val = load float, float *%ptr2 %res = call double @llvm.experimental.constrained.fpext.f64.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll index b3fbac975a1b41..7a8a7a88e28f38 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-03.ll @@ -5,20 +5,20 @@ declare fp128 @llvm.experimental.constrained.fpext.f128.f32(float, metadata) ; Check register extension. -define void @f1(fp128 *%dst, float %val) { +define void @f1(fp128 *%dst, float %val) #0 { ; CHECK-LABEL: f1: ; CHECK: lxebr %f0, %f0 ; CHECK: std %f0, 0(%r2) ; CHECK: std %f2, 8(%r2) ; CHECK: br %r14 %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the low end of the LXEB range. 
-define void @f2(fp128 *%dst, float *%ptr) { +define void @f2(fp128 *%dst, float *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: lxeb %f0, 0(%r3) ; CHECK: std %f0, 0(%r2) @@ -26,13 +26,13 @@ define void @f2(fp128 *%dst, float *%ptr) { ; CHECK: br %r14 %val = load float, float *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the high end of the aligned LXEB range. -define void @f3(fp128 *%dst, float *%base) { +define void @f3(fp128 *%dst, float *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: lxeb %f0, 4092(%r3) ; CHECK: std %f0, 0(%r2) @@ -41,14 +41,14 @@ define void @f3(fp128 *%dst, float *%base) { %ptr = getelementptr float, float *%base, i64 1023 %val = load float, float *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the next word up, which needs separate address logic. ; Other sequences besides this one would be OK. -define void @f4(fp128 *%dst, float *%base) { +define void @f4(fp128 *%dst, float *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r3, 4096 ; CHECK: lxeb %f0, 0(%r3) @@ -58,13 +58,13 @@ define void @f4(fp128 *%dst, float *%base) { %ptr = getelementptr float, float *%base, i64 1024 %val = load float, float *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check negative displacements, which also need separate address logic. -define void @f5(fp128 *%dst, float *%base) { +define void @f5(fp128 *%dst, float *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r3, -4 ; CHECK: lxeb %f0, 0(%r3) @@ -74,13 +74,13 @@ define void @f5(fp128 *%dst, float *%base) { %ptr = getelementptr float, float *%base, i64 -1 %val = load float, float *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check that LXEB allows indices. -define void @f6(fp128 *%dst, float *%base, i64 %index) { +define void @f6(fp128 *%dst, float *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r4, 2 ; CHECK: lxeb %f0, 400(%r1,%r3) @@ -91,8 +91,9 @@ define void @f6(fp128 *%dst, float *%base, i64 %index) { %ptr2 = getelementptr float, float *%ptr1, i64 100 %val = load float, float *%ptr2 %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll index 657cdcdfd50c0f..4ddfe1031c322c 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-04.ll @@ -5,20 +5,20 @@ declare fp128 @llvm.experimental.constrained.fpext.f128.f64(double, metadata) ; Check register extension. 
-define void @f1(fp128 *%dst, double %val) { +define void @f1(fp128 *%dst, double %val) #0 { ; CHECK-LABEL: f1: ; CHECK: lxdbr %f0, %f0 ; CHECK: std %f0, 0(%r2) ; CHECK: std %f2, 8(%r2) ; CHECK: br %r14 %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the low end of the LXDB range. -define void @f2(fp128 *%dst, double *%ptr) { +define void @f2(fp128 *%dst, double *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: lxdb %f0, 0(%r3) ; CHECK: std %f0, 0(%r2) @@ -26,13 +26,13 @@ define void @f2(fp128 *%dst, double *%ptr) { ; CHECK: br %r14 %val = load double, double *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the high end of the aligned LXDB range. -define void @f3(fp128 *%dst, double *%base) { +define void @f3(fp128 *%dst, double *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: lxdb %f0, 4088(%r3) ; CHECK: std %f0, 0(%r2) @@ -41,14 +41,14 @@ define void @f3(fp128 *%dst, double *%base) { %ptr = getelementptr double, double *%base, i64 511 %val = load double, double *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the next doubleword up, which needs separate address logic. ; Other sequences besides this one would be OK. -define void @f4(fp128 *%dst, double *%base) { +define void @f4(fp128 *%dst, double *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r3, 4096 ; CHECK: lxdb %f0, 0(%r3) @@ -58,13 +58,13 @@ define void @f4(fp128 *%dst, double *%base) { %ptr = getelementptr double, double *%base, i64 512 %val = load double, double *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check negative displacements, which also need separate address logic. -define void @f5(fp128 *%dst, double *%base) { +define void @f5(fp128 *%dst, double *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r3, -8 ; CHECK: lxdb %f0, 0(%r3) @@ -74,13 +74,13 @@ define void @f5(fp128 *%dst, double *%base) { %ptr = getelementptr double, double *%base, i64 -1 %val = load double, double *%ptr %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check that LXDB allows indices. 
-define void @f6(fp128 *%dst, double *%base, i64 %index) { +define void @f6(fp128 *%dst, double *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r4, 3 ; CHECK: lxdb %f0, 800(%r1,%r3) @@ -91,8 +91,9 @@ define void @f6(fp128 *%dst, double *%base, i64 %index) { %ptr2 = getelementptr double, double *%ptr1, i64 100 %val = load double, double *%ptr2 %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-09.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-09.ll index b1a11e66e2a68c..abdb865d4df5b2 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-09.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-09.ll @@ -7,27 +7,27 @@ declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptosi.i32.f128(fp128, metadata) ; Test f32->i32. -define i32 @f1(float %f) { +define i32 @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: cfebr %r2, 5, %f0 ; CHECK: br %r14 %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test f64->i32. -define i32 @f2(double %f) { +define i32 @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: cfdbr %r2, 5, %f0 ; CHECK: br %r14 %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test f128->i32. -define i32 @f3(fp128 *%src) { +define i32 @f3(fp128 *%src) #0 { ; CHECK-LABEL: f3: ; CHECK: ld %f0, 0(%r2) ; CHECK: ld %f2, 8(%r2) @@ -35,6 +35,8 @@ define i32 @f3(fp128 *%src) { ; CHECK: br %r14 %f = load fp128, fp128 *%src %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll index cc6450a6c04b8b..aef02cd8cdb6d1 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-10.ll @@ -14,7 +14,7 @@ declare i32 @llvm.experimental.constrained.fptoui.i32.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128, metadata) ; Test f32->i32. -define i32 @f1(float %f) { +define i32 @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: ; CHECK-NEXT: larl %r1, .LCPI0_0 @@ -30,12 +30,12 @@ define i32 @f1(float %f) { ; CHECK-NEXT: xr %r2, %r0 ; CHECK-NEXT: br %r14 %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test f64->i32. -define i32 @f2(double %f) { +define i32 @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: ; CHECK-NEXT: larl %r1, .LCPI1_0 @@ -51,12 +51,12 @@ define i32 @f2(double %f) { ; CHECK-NEXT: xr %r2, %r0 ; CHECK-NEXT: br %r14 %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f64(double %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test f128->i32. 
-define i32 @f3(fp128 *%src) { +define i32 @f3(fp128 *%src) #0 { ; CHECK-LABEL: f3: ; CHECK: # %bb.0: ; CHECK-NEXT: ld %f0, 0(%r2) @@ -75,6 +75,8 @@ define i32 @f3(fp128 *%src) { ; CHECK-NEXT: br %r14 %f = load fp128, fp128 *%src %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-11.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-11.ll index bc3e9b9b8a56b6..d9cc33700e7ae1 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-11.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-11.ll @@ -7,27 +7,27 @@ declare i64 @llvm.experimental.constrained.fptosi.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.fptosi.i64.f128(fp128, metadata) ; Test f32->i64. -define i64 @f1(float %f) { +define i64 @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: cgebr %r2, 5, %f0 ; CHECK: br %r14 %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f32(float %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } ; Test f64->i64. -define i64 @f2(double %f) { +define i64 @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: cgdbr %r2, 5, %f0 ; CHECK: br %r14 %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f64(double %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } ; Test f128->i64. -define i64 @f3(fp128 *%src) { +define i64 @f3(fp128 *%src) #0 { ; CHECK-LABEL: f3: ; CHECK: ld %f0, 0(%r2) ; CHECK: ld %f2, 8(%r2) @@ -35,6 +35,8 @@ define i64 @f3(fp128 *%src) { ; CHECK: br %r14 %f = load fp128, fp128 *%src %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll index e9811cf2fd9c54..2d7618f499641d 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-12.ll @@ -13,7 +13,7 @@ declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128, metadata) ; Test f32->i64. -define i64 @f1(float %f) { +define i64 @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: ; CHECK-NEXT: larl %r1, .LCPI0_0 @@ -29,12 +29,12 @@ define i64 @f1(float %f) { ; CHECK-NEXT: xgr %r2, %r0 ; CHECK-NEXT: br %r14 %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f32(float %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } ; Test f64->i64. -define i64 @f2(double %f) { +define i64 @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: ; CHECK-NEXT: larl %r1, .LCPI1_0 @@ -50,12 +50,12 @@ define i64 @f2(double %f) { ; CHECK-NEXT: xgr %r2, %r0 ; CHECK-NEXT: br %r14 %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } ; Test f128->i64. 
-define i64 @f3(fp128 *%src) { +define i64 @f3(fp128 *%src) #0 { ; CHECK-LABEL: f3: ; CHECK: # %bb.0: ; CHECK-NEXT: ld %f0, 0(%r2) @@ -74,6 +74,8 @@ define i64 @f3(fp128 *%src) { ; CHECK-NEXT: br %r14 %f = load fp128, fp128 *%src %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-14.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-14.ll index 70a02c55799559..83478bab74d7b3 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-14.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-14.ll @@ -11,27 +11,27 @@ declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128, metadata) ; Test f32->i32. -define i32 @f1(float %f) { +define i32 @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: clfebr %r2, 5, %f0, 0 ; CHECK: br %r14 %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test f64->i32. -define i32 @f2(double %f) { +define i32 @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: clfdbr %r2, 5, %f0, 0 ; CHECK: br %r14 %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f64(double %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test f128->i32. -define i32 @f3(fp128 *%src) { +define i32 @f3(fp128 *%src) #0 { ; CHECK-LABEL: f3: ; CHECK-DAG: ld %f0, 0(%r2) ; CHECK-DAG: ld %f2, 8(%r2) @@ -39,32 +39,32 @@ define i32 @f3(fp128 *%src) { ; CHECK: br %r14 %f = load fp128, fp128 *%src %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test f32->i64. -define i64 @f4(float %f) { +define i64 @f4(float %f) #0 { ; CHECK-LABEL: f4: ; CHECK: clgebr %r2, 5, %f0, 0 ; CHECK: br %r14 %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f32(float %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } ; Test f64->i64. -define i64 @f5(double %f) { +define i64 @f5(double %f) #0 { ; CHECK-LABEL: f5: ; CHECK: clgdbr %r2, 5, %f0, 0 ; CHECK: br %r14 %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } ; Test f128->i64. -define i64 @f6(fp128 *%src) { +define i64 @f6(fp128 *%src) #0 { ; CHECK-LABEL: f6: ; CHECK-DAG: ld %f0, 0(%r2) ; CHECK-DAG: ld %f2, 8(%r2) @@ -72,6 +72,8 @@ define i64 @f6(fp128 *%src) { ; CHECK: br %r14 %f = load fp128, fp128 *%src %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-15.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-15.ll index 64a82c32fbef39..9b080c14e18a8c 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-15.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-15.ll @@ -9,7 +9,7 @@ declare fp128 @llvm.experimental.constrained.fpext.f128.f32(float, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f64(double, metadata) ; Test f128->f64. 
-define double @f1(fp128 *%ptr) { +define double @f1(fp128 *%ptr) #0 { ; CHECK-LABEL: f1: ; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wflrx %f0, [[REG]], 0, 0 @@ -18,12 +18,12 @@ define double @f1(fp128 *%ptr) { %res = call double @llvm.experimental.constrained.fptrunc.f64.f128( fp128 %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test f128->f32. -define float @f2(fp128 *%ptr) { +define float @f2(fp128 *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wflrx %f0, [[REG]], 0, 3 @@ -33,32 +33,33 @@ define float @f2(fp128 *%ptr) { %res = call float @llvm.experimental.constrained.fptrunc.f32.f128( fp128 %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test f64->f128. -define void @f3(fp128 *%dst, double %val) { +define void @f3(fp128 *%dst, double %val) #0 { ; CHECK-LABEL: f3: ; CHECK: wflld [[RES:%v[0-9]+]], %f0 ; CHECK: vst [[RES]], 0(%r2) ; CHECK: br %r14 %res = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Test f32->f128. -define void @f4(fp128 *%dst, float %val) { +define void @f4(fp128 *%dst, float %val) #0 { ; CHECK-LABEL: f4: ; CHECK: ldebr %f0, %f0 ; CHECK: wflld [[RES:%v[0-9]+]], %f0 ; CHECK: vst [[RES]], 0(%r2) ; CHECK: br %r14 %res = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %val, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-16.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-16.ll index fbbb608ac7c254..f2e4121d1f299f 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-16.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-16.ll @@ -11,7 +11,7 @@ declare i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128, metadata) ; Test signed f128->i32. -define i32 @f5(fp128 *%src) { +define i32 @f5(fp128 *%src) #0 { ; CHECK-LABEL: f5: ; CHECK: vl %v0, 0(%r2) ; CHECK: vrepg %v2, %v0, 1 @@ -19,12 +19,12 @@ define i32 @f5(fp128 *%src) { ; CHECK: br %r14 %f = load fp128, fp128 *%src %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test signed f128->i64. -define i64 @f6(fp128 *%src) { +define i64 @f6(fp128 *%src) #0 { ; CHECK-LABEL: f6: ; CHECK: vl %v0, 0(%r2) ; CHECK: vrepg %v2, %v0, 1 @@ -32,12 +32,12 @@ define i64 @f6(fp128 *%src) { ; CHECK: br %r14 %f = load fp128, fp128 *%src %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } ; Test unsigned f128->i32. -define i32 @f7(fp128 *%src) { +define i32 @f7(fp128 *%src) #0 { ; CHECK-LABEL: f7: ; CHECK: vl %v0, 0(%r2) ; CHECK: vrepg %v2, %v0, 1 @@ -45,12 +45,12 @@ define i32 @f7(fp128 *%src) { ; CHECK: br %r14 %f = load fp128, fp128 *%src %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %conv } ; Test unsigned f128->i64. 
-define i64 @f8(fp128 *%src) { +define i64 @f8(fp128 *%src) #0 { ; CHECK-LABEL: f8: ; CHECK: vl %v0, 0(%r2) ; CHECK: vrepg %v2, %v0, 1 @@ -58,6 +58,8 @@ define i64 @f8(fp128 *%src) { ; CHECK: br %r14 %f = load fp128, fp128 *%src %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128 %f, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i64 %conv } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll index 7f9a4fce17eecc..27cb70cd06fb9f 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-div-01.ll @@ -8,19 +8,19 @@ declare float @foo() declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata) ; Check register division. -define float @f1(float %f1, float %f2) { +define float @f1(float %f1, float %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: debr %f0, %f2 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.fdiv.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the low end of the DEB range. -define float @f2(float %f1, float *%ptr) { +define float @f2(float %f1, float *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: deb %f0, 0(%r2) ; CHECK: br %r14 @@ -28,12 +28,12 @@ define float @f2(float %f1, float *%ptr) { %res = call float @llvm.experimental.constrained.fdiv.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the high end of the aligned DEB range. -define float @f3(float %f1, float *%base) { +define float @f3(float %f1, float *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: deb %f0, 4092(%r2) ; CHECK: br %r14 @@ -42,13 +42,13 @@ define float @f3(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fdiv.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the next word up, which needs separate address logic. ; Other sequences besides this one would be OK. -define float @f4(float %f1, float *%base) { +define float @f4(float %f1, float *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: deb %f0, 0(%r2) @@ -58,12 +58,12 @@ define float @f4(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fdiv.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check negative displacements, which also need separate address logic. -define float @f5(float %f1, float *%base) { +define float @f5(float %f1, float *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: deb %f0, 0(%r2) @@ -73,12 +73,12 @@ define float @f5(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fdiv.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that DEB allows indices. 
-define float @f6(float %f1, float *%base, i64 %index) { +define float @f6(float %f1, float *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: deb %f0, 400(%r1,%r2) @@ -89,12 +89,12 @@ define float @f6(float %f1, float *%base, i64 %index) { %res = call float @llvm.experimental.constrained.fdiv.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that divisions of spilled values can use DEB rather than DEBR. -define float @f7(float *%ptr0) { +define float @f7(float *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK-SCALAR: deb %f0, 16{{[04]}}(%r15) @@ -122,52 +122,54 @@ define float @f7(float *%ptr0) { %val9 = load float, float *%ptr9 %val10 = load float, float *%ptr10 - %ret = call float @foo() + %ret = call float @foo() #0 %div0 = call float @llvm.experimental.constrained.fdiv.f32( float %ret, float %val0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div1 = call float @llvm.experimental.constrained.fdiv.f32( float %div0, float %val1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div2 = call float @llvm.experimental.constrained.fdiv.f32( float %div1, float %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div3 = call float @llvm.experimental.constrained.fdiv.f32( float %div2, float %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div4 = call float @llvm.experimental.constrained.fdiv.f32( float %div3, float %val4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div5 = call float @llvm.experimental.constrained.fdiv.f32( float %div4, float %val5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div6 = call float @llvm.experimental.constrained.fdiv.f32( float %div5, float %val6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div7 = call float @llvm.experimental.constrained.fdiv.f32( float %div6, float %val7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div8 = call float @llvm.experimental.constrained.fdiv.f32( float %div7, float %val8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div9 = call float @llvm.experimental.constrained.fdiv.f32( float %div8, float %val9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div10 = call float @llvm.experimental.constrained.fdiv.f32( float %div9, float %val10, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %div10 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-div-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-div-02.ll index 850af2172f0130..d8fc2bd7430ab6 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-div-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-div-02.ll @@ -8,19 +8,19 @@ declare double @foo() declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) ; Check register division. 
-define double @f1(double %f1, double %f2) { +define double @f1(double %f1, double %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: ddbr %f0, %f2 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.fdiv.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the low end of the DDB range. -define double @f2(double %f1, double *%ptr) { +define double @f2(double %f1, double *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: ddb %f0, 0(%r2) ; CHECK: br %r14 @@ -28,12 +28,12 @@ define double @f2(double %f1, double *%ptr) { %res = call double @llvm.experimental.constrained.fdiv.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the high end of the aligned DDB range. -define double @f3(double %f1, double *%base) { +define double @f3(double %f1, double *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: ddb %f0, 4088(%r2) ; CHECK: br %r14 @@ -42,13 +42,13 @@ define double @f3(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fdiv.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the next doubleword up, which needs separate address logic. ; Other sequences besides this one would be OK. -define double @f4(double %f1, double *%base) { +define double @f4(double %f1, double *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: ddb %f0, 0(%r2) @@ -58,12 +58,12 @@ define double @f4(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fdiv.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check negative displacements, which also need separate address logic. -define double @f5(double %f1, double *%base) { +define double @f5(double %f1, double *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -8 ; CHECK: ddb %f0, 0(%r2) @@ -73,12 +73,12 @@ define double @f5(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fdiv.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that DDB allows indices. -define double @f6(double %f1, double *%base, i64 %index) { +define double @f6(double %f1, double *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 3 ; CHECK: ddb %f0, 800(%r1,%r2) @@ -89,12 +89,12 @@ define double @f6(double %f1, double *%base, i64 %index) { %res = call double @llvm.experimental.constrained.fdiv.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that divisions of spilled values can use DDB rather than DDBR. 
-define double @f7(double *%ptr0) { +define double @f7(double *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK-SCALAR: ddb %f0, 160(%r15) @@ -122,52 +122,54 @@ define double @f7(double *%ptr0) { %val9 = load double, double *%ptr9 %val10 = load double, double *%ptr10 - %ret = call double @foo() + %ret = call double @foo() #0 %div0 = call double @llvm.experimental.constrained.fdiv.f64( double %ret, double %val0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div1 = call double @llvm.experimental.constrained.fdiv.f64( double %div0, double %val1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div2 = call double @llvm.experimental.constrained.fdiv.f64( double %div1, double %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div3 = call double @llvm.experimental.constrained.fdiv.f64( double %div2, double %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div4 = call double @llvm.experimental.constrained.fdiv.f64( double %div3, double %val4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div5 = call double @llvm.experimental.constrained.fdiv.f64( double %div4, double %val5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div6 = call double @llvm.experimental.constrained.fdiv.f64( double %div5, double %val6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div7 = call double @llvm.experimental.constrained.fdiv.f64( double %div6, double %val7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div8 = call double @llvm.experimental.constrained.fdiv.f64( double %div7, double %val8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div9 = call double @llvm.experimental.constrained.fdiv.f64( double %div8, double %val9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %div10 = call double @llvm.experimental.constrained.fdiv.f64( double %div9, double %val10, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %div10 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-div-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-div-03.ll index 860f91d8ed8bc4..fcd2184ac4fe09 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-div-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-div-03.ll @@ -5,7 +5,7 @@ declare fp128 @llvm.experimental.constrained.fdiv.f128(fp128, fp128, metadata, metadata) ; There is no memory form of 128-bit division. 
-define void @f1(fp128 *%ptr, float %f2) { +define void @f1(fp128 *%ptr, float %f2) strictfp { ; CHECK-LABEL: f1: ; CHECK-DAG: lxebr %f0, %f0 ; CHECK-DAG: ld %f1, 0(%r2) @@ -19,7 +19,7 @@ define void @f1(fp128 *%ptr, float %f2) { %sum = call fp128 @llvm.experimental.constrained.fdiv.f128( fp128 %f1, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %sum, fp128 *%ptr ret void } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-div-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-div-04.ll index e3f117515aa174..a43eebb5f0a8d2 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-div-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-div-04.ll @@ -4,7 +4,7 @@ declare fp128 @llvm.experimental.constrained.fdiv.f128(fp128, fp128, metadata, metadata) -define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +define void @f1(fp128 *%ptr1, fp128 *%ptr2) strictfp { ; CHECK-LABEL: f1: ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) ; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) @@ -16,7 +16,7 @@ define void @f1(fp128 *%ptr1, fp128 *%ptr2) { %sum = call fp128 @llvm.experimental.constrained.fdiv.f128( fp128 %f1, fp128 %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %sum, fp128 *%ptr1 ret void } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-01.ll index 3e07091bf3c6c1..623ef9007f6104 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-01.ll @@ -8,19 +8,19 @@ declare float @foo() declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) ; Check register multiplication. -define float @f1(float %f1, float %f2) { +define float @f1(float %f1, float %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: meebr %f0, %f2 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.fmul.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the low end of the MEEB range. -define float @f2(float %f1, float *%ptr) { +define float @f2(float %f1, float *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: meeb %f0, 0(%r2) ; CHECK: br %r14 @@ -28,12 +28,12 @@ define float @f2(float %f1, float *%ptr) { %res = call float @llvm.experimental.constrained.fmul.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the high end of the aligned MEEB range. -define float @f3(float %f1, float *%base) { +define float @f3(float %f1, float *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: meeb %f0, 4092(%r2) ; CHECK: br %r14 @@ -42,13 +42,13 @@ define float @f3(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fmul.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the next word up, which needs separate address logic. ; Other sequences besides this one would be OK. 
-define float @f4(float %f1, float *%base) { +define float @f4(float %f1, float *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: meeb %f0, 0(%r2) @@ -58,12 +58,12 @@ define float @f4(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fmul.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check negative displacements, which also need separate address logic. -define float @f5(float %f1, float *%base) { +define float @f5(float %f1, float *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: meeb %f0, 0(%r2) @@ -73,12 +73,12 @@ define float @f5(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fmul.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that MEEB allows indices. -define float @f6(float %f1, float *%base, i64 %index) { +define float @f6(float %f1, float *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: meeb %f0, 400(%r1,%r2) @@ -89,12 +89,12 @@ define float @f6(float %f1, float *%base, i64 %index) { %res = call float @llvm.experimental.constrained.fmul.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that multiplications of spilled values can use MEEB rather than MEEBR. -define float @f7(float *%ptr0) { +define float @f7(float *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK-SCALAR: meeb %f0, 16{{[04]}}(%r15) @@ -122,52 +122,54 @@ define float @f7(float *%ptr0) { %val9 = load float, float *%ptr9 %val10 = load float, float *%ptr10 - %ret = call float @foo() + %ret = call float @foo() #0 %mul0 = call float @llvm.experimental.constrained.fmul.f32( float %ret, float %val0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul1 = call float @llvm.experimental.constrained.fmul.f32( float %mul0, float %val1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul2 = call float @llvm.experimental.constrained.fmul.f32( float %mul1, float %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul3 = call float @llvm.experimental.constrained.fmul.f32( float %mul2, float %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul4 = call float @llvm.experimental.constrained.fmul.f32( float %mul3, float %val4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul5 = call float @llvm.experimental.constrained.fmul.f32( float %mul4, float %val5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul6 = call float @llvm.experimental.constrained.fmul.f32( float %mul5, float %val6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul7 = call float @llvm.experimental.constrained.fmul.f32( float %mul6, float %val7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul8 = call float @llvm.experimental.constrained.fmul.f32( float %mul7, float %val8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul9 = call float @llvm.experimental.constrained.fmul.f32( float %mul8, float %val9, metadata !"round.dynamic", - metadata 
!"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul10 = call float @llvm.experimental.constrained.fmul.f32( float %mul9, float %val10, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %mul10 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll index 6f080f6e4ff8e0..7acabef29f4ea4 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-02.ll @@ -7,7 +7,7 @@ declare float @foo() declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) ; Check register multiplication. -define double @f1(float %f1, float %f2) { +define double @f1(float %f1, float %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: mdebr %f0, %f2 ; CHECK: br %r14 @@ -16,12 +16,12 @@ define double @f1(float %f1, float %f2) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the low end of the MDEB range. -define double @f2(float %f1, float *%ptr) { +define double @f2(float %f1, float *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: mdeb %f0, 0(%r2) ; CHECK: br %r14 @@ -31,12 +31,12 @@ define double @f2(float %f1, float *%ptr) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the high end of the aligned MDEB range. -define double @f3(float %f1, float *%base) { +define double @f3(float %f1, float *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: mdeb %f0, 4092(%r2) ; CHECK: br %r14 @@ -47,13 +47,13 @@ define double @f3(float %f1, float *%base) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the next word up, which needs separate address logic. ; Other sequences besides this one would be OK. -define double @f4(float %f1, float *%base) { +define double @f4(float %f1, float *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: mdeb %f0, 0(%r2) @@ -65,12 +65,12 @@ define double @f4(float %f1, float *%base) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check negative displacements, which also need separate address logic. -define double @f5(float %f1, float *%base) { +define double @f5(float %f1, float *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: mdeb %f0, 0(%r2) @@ -82,12 +82,12 @@ define double @f5(float %f1, float *%base) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that MDEB allows indices. 
-define double @f6(float %f1, float *%base, i64 %index) { +define double @f6(float %f1, float *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mdeb %f0, 400(%r1,%r2) @@ -100,12 +100,12 @@ define double @f6(float %f1, float *%base, i64 %index) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1x, double %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that multiplications of spilled values can use MDEB rather than MDEBR. -define float @f7(float *%ptr0) { +define float @f7(float *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK: mdeb %f0, 16{{[04]}}(%r15) @@ -157,18 +157,18 @@ define float @f7(float *%ptr0) { store float %frob9, float *%ptr9 store float %frob10, float *%ptr10 - %ret = call float @foo() + %ret = call float @foo() #0 %accext0 = fpext float %ret to double %ext0 = fpext float %frob0 to double %mul0 = call double @llvm.experimental.constrained.fmul.f64( double %accext0, double %ext0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra0 = call double @llvm.experimental.constrained.fmul.f64( double %mul0, double 1.01, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc0 = fptrunc double %extra0 to float %accext1 = fpext float %trunc0 to double @@ -176,11 +176,11 @@ define float @f7(float *%ptr0) { %mul1 = call double @llvm.experimental.constrained.fmul.f64( double %accext1, double %ext1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra1 = call double @llvm.experimental.constrained.fmul.f64( double %mul1, double 1.11, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc1 = fptrunc double %extra1 to float %accext2 = fpext float %trunc1 to double @@ -188,11 +188,11 @@ define float @f7(float *%ptr0) { %mul2 = call double @llvm.experimental.constrained.fmul.f64( double %accext2, double %ext2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra2 = call double @llvm.experimental.constrained.fmul.f64( double %mul2, double 1.21, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc2 = fptrunc double %extra2 to float %accext3 = fpext float %trunc2 to double @@ -200,11 +200,11 @@ define float @f7(float *%ptr0) { %mul3 = call double @llvm.experimental.constrained.fmul.f64( double %accext3, double %ext3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra3 = call double @llvm.experimental.constrained.fmul.f64( double %mul3, double 1.31, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc3 = fptrunc double %extra3 to float %accext4 = fpext float %trunc3 to double @@ -212,11 +212,11 @@ define float @f7(float *%ptr0) { %mul4 = call double @llvm.experimental.constrained.fmul.f64( double %accext4, double %ext4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra4 = call double @llvm.experimental.constrained.fmul.f64( double %mul4, double 1.41, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc4 = fptrunc double %extra4 to float %accext5 = fpext float %trunc4 to double @@ -224,11 +224,11 @@ define float @f7(float *%ptr0) { %mul5 = call double @llvm.experimental.constrained.fmul.f64( double 
%accext5, double %ext5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra5 = call double @llvm.experimental.constrained.fmul.f64( double %mul5, double 1.51, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc5 = fptrunc double %extra5 to float %accext6 = fpext float %trunc5 to double @@ -236,11 +236,11 @@ define float @f7(float *%ptr0) { %mul6 = call double @llvm.experimental.constrained.fmul.f64( double %accext6, double %ext6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra6 = call double @llvm.experimental.constrained.fmul.f64( double %mul6, double 1.61, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc6 = fptrunc double %extra6 to float %accext7 = fpext float %trunc6 to double @@ -248,11 +248,11 @@ define float @f7(float *%ptr0) { %mul7 = call double @llvm.experimental.constrained.fmul.f64( double %accext7, double %ext7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra7 = call double @llvm.experimental.constrained.fmul.f64( double %mul7, double 1.71, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc7 = fptrunc double %extra7 to float %accext8 = fpext float %trunc7 to double @@ -260,11 +260,11 @@ define float @f7(float *%ptr0) { %mul8 = call double @llvm.experimental.constrained.fmul.f64( double %accext8, double %ext8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra8 = call double @llvm.experimental.constrained.fmul.f64( double %mul8, double 1.81, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc8 = fptrunc double %extra8 to float %accext9 = fpext float %trunc8 to double @@ -272,12 +272,14 @@ define float @f7(float *%ptr0) { %mul9 = call double @llvm.experimental.constrained.fmul.f64( double %accext9, double %ext9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %extra9 = call double @llvm.experimental.constrained.fmul.f64( double %mul9, double 1.91, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc9 = fptrunc double %extra9 to float ret float %trunc9 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-03.ll index 736bd2e506fa66..edfc5d46ba9284 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-03.ll @@ -8,19 +8,19 @@ declare double @foo() declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) ; Check register multiplication. -define double @f1(double %f1, double %f2) { +define double @f1(double %f1, double %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: mdbr %f0, %f2 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.fmul.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the low end of the MDB range. 
-define double @f2(double %f1, double *%ptr) { +define double @f2(double %f1, double *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: mdb %f0, 0(%r2) ; CHECK: br %r14 @@ -28,12 +28,12 @@ define double @f2(double %f1, double *%ptr) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the high end of the aligned MDB range. -define double @f3(double %f1, double *%base) { +define double @f3(double %f1, double *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: mdb %f0, 4088(%r2) ; CHECK: br %r14 @@ -42,13 +42,13 @@ define double @f3(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the next doubleword up, which needs separate address logic. ; Other sequences besides this one would be OK. -define double @f4(double %f1, double *%base) { +define double @f4(double %f1, double *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: mdb %f0, 0(%r2) @@ -58,12 +58,12 @@ define double @f4(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check negative displacements, which also need separate address logic. -define double @f5(double %f1, double *%base) { +define double @f5(double %f1, double *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -8 ; CHECK: mdb %f0, 0(%r2) @@ -73,12 +73,12 @@ define double @f5(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that MDB allows indices. -define double @f6(double %f1, double *%base, i64 %index) { +define double @f6(double %f1, double *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 3 ; CHECK: mdb %f0, 800(%r1,%r2) @@ -89,12 +89,12 @@ define double @f6(double %f1, double *%base, i64 %index) { %res = call double @llvm.experimental.constrained.fmul.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that multiplications of spilled values can use MDB rather than MDBR. 
-define double @f7(double *%ptr0) { +define double @f7(double *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK-SCALAR: mdb %f0, 160(%r15) @@ -122,52 +122,54 @@ define double @f7(double *%ptr0) { %val9 = load double, double *%ptr9 %val10 = load double, double *%ptr10 - %ret = call double @foo() + %ret = call double @foo() #0 %mul0 = call double @llvm.experimental.constrained.fmul.f64( double %ret, double %val0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul1 = call double @llvm.experimental.constrained.fmul.f64( double %mul0, double %val1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul2 = call double @llvm.experimental.constrained.fmul.f64( double %mul1, double %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul3 = call double @llvm.experimental.constrained.fmul.f64( double %mul2, double %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul4 = call double @llvm.experimental.constrained.fmul.f64( double %mul3, double %val4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul5 = call double @llvm.experimental.constrained.fmul.f64( double %mul4, double %val5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul6 = call double @llvm.experimental.constrained.fmul.f64( double %mul5, double %val6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul7 = call double @llvm.experimental.constrained.fmul.f64( double %mul6, double %val7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul8 = call double @llvm.experimental.constrained.fmul.f64( double %mul7, double %val8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul9 = call double @llvm.experimental.constrained.fmul.f64( double %mul8, double %val9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul10 = call double @llvm.experimental.constrained.fmul.f64( double %mul9, double %val10, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %mul10 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll index a613030792ec3e..924845a99d746e 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-04.ll @@ -10,7 +10,7 @@ declare double @foo() ; Check register multiplication. "mxdbr %f0, %f2" is not valid from LLVM's ; point of view, because %f2 is the low register of the FP128 %f0. Pass the ; multiplier in %f4 instead. -define void @f1(double %f1, double %dummy, double %f2, fp128 *%dst) { +define void @f1(double %f1, double %dummy, double %f2, fp128 *%dst) #0 { ; CHECK-LABEL: f1: ; CHECK: mxdbr %f0, %f4 ; CHECK: std %f0, 0(%r2) @@ -21,13 +21,13 @@ define void @f1(double %f1, double %dummy, double %f2, fp128 *%dst) { %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the low end of the MXDB range. 
-define void @f2(double %f1, double *%ptr, fp128 *%dst) { +define void @f2(double %f1, double *%ptr, fp128 *%dst) #0 { ; CHECK-LABEL: f2: ; CHECK: mxdb %f0, 0(%r2) ; CHECK: std %f0, 0(%r3) @@ -39,13 +39,13 @@ define void @f2(double %f1, double *%ptr, fp128 *%dst) { %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the high end of the aligned MXDB range. -define void @f3(double %f1, double *%base, fp128 *%dst) { +define void @f3(double %f1, double *%base, fp128 *%dst) #0 { ; CHECK-LABEL: f3: ; CHECK: mxdb %f0, 4088(%r2) ; CHECK: std %f0, 0(%r3) @@ -58,14 +58,14 @@ define void @f3(double %f1, double *%base, fp128 *%dst) { %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check the next doubleword up, which needs separate address logic. ; Other sequences besides this one would be OK. -define void @f4(double %f1, double *%base, fp128 *%dst) { +define void @f4(double %f1, double *%base, fp128 *%dst) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: mxdb %f0, 0(%r2) @@ -79,13 +79,13 @@ define void @f4(double %f1, double *%base, fp128 *%dst) { %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check negative displacements, which also need separate address logic. -define void @f5(double %f1, double *%base, fp128 *%dst) { +define void @f5(double %f1, double *%base, fp128 *%dst) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -8 ; CHECK: mxdb %f0, 0(%r2) @@ -99,13 +99,13 @@ define void @f5(double %f1, double *%base, fp128 *%dst) { %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check that MXDB allows indices. -define void @f6(double %f1, double *%base, i64 %index, fp128 *%dst) { +define void @f6(double %f1, double *%base, i64 %index, fp128 *%dst) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 3 ; CHECK: mxdb %f0, 800(%r1,%r2) @@ -120,13 +120,13 @@ define void @f6(double %f1, double *%base, i64 %index, fp128 *%dst) { %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } ; Check that multiplications of spilled values can use MXDB rather than MXDBR. 
-define double @f7(double *%ptr0) { +define double @f7(double *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK: mxdb %f0, 160(%r15) @@ -178,19 +178,19 @@ define double @f7(double *%ptr0) { store double %frob9, double *%ptr9 store double %frob10, double *%ptr10 - %ret = call double @foo() + %ret = call double @foo() #0 %accext0 = fpext double %ret to fp128 %ext0 = fpext double %frob0 to fp128 %mul0 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext0, fp128 %ext0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const0 = fpext double 1.01 to fp128 %extra0 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul0, fp128 %const0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc0 = fptrunc fp128 %extra0 to double %accext1 = fpext double %trunc0 to fp128 @@ -198,12 +198,12 @@ define double @f7(double *%ptr0) { %mul1 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext1, fp128 %ext1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const1 = fpext double 1.11 to fp128 %extra1 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul1, fp128 %const1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc1 = fptrunc fp128 %extra1 to double %accext2 = fpext double %trunc1 to fp128 @@ -211,12 +211,12 @@ define double @f7(double *%ptr0) { %mul2 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext2, fp128 %ext2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const2 = fpext double 1.21 to fp128 %extra2 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul2, fp128 %const2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc2 = fptrunc fp128 %extra2 to double %accext3 = fpext double %trunc2 to fp128 @@ -224,12 +224,12 @@ define double @f7(double *%ptr0) { %mul3 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext3, fp128 %ext3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const3 = fpext double 1.31 to fp128 %extra3 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul3, fp128 %const3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc3 = fptrunc fp128 %extra3 to double %accext4 = fpext double %trunc3 to fp128 @@ -237,12 +237,12 @@ define double @f7(double *%ptr0) { %mul4 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext4, fp128 %ext4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const4 = fpext double 1.41 to fp128 %extra4 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul4, fp128 %const4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc4 = fptrunc fp128 %extra4 to double %accext5 = fpext double %trunc4 to fp128 @@ -250,12 +250,12 @@ define double @f7(double *%ptr0) { %mul5 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext5, fp128 %ext5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const5 = fpext double 1.51 to fp128 %extra5 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul5, fp128 %const5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc5 = fptrunc fp128 
%extra5 to double %accext6 = fpext double %trunc5 to fp128 @@ -263,12 +263,12 @@ define double @f7(double *%ptr0) { %mul6 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext6, fp128 %ext6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const6 = fpext double 1.61 to fp128 %extra6 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul6, fp128 %const6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc6 = fptrunc fp128 %extra6 to double %accext7 = fpext double %trunc6 to fp128 @@ -276,12 +276,12 @@ define double @f7(double *%ptr0) { %mul7 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext7, fp128 %ext7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const7 = fpext double 1.71 to fp128 %extra7 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul7, fp128 %const7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc7 = fptrunc fp128 %extra7 to double %accext8 = fpext double %trunc7 to fp128 @@ -289,12 +289,12 @@ define double @f7(double *%ptr0) { %mul8 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext8, fp128 %ext8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const8 = fpext double 1.81 to fp128 %extra8 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul8, fp128 %const8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc8 = fptrunc fp128 %extra8 to double %accext9 = fpext double %trunc8 to fp128 @@ -302,13 +302,15 @@ define double @f7(double *%ptr0) { %mul9 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %accext9, fp128 %ext9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %const9 = fpext double 1.91 to fp128 %extra9 = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %mul9, fp128 %const9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %trunc9 = fptrunc fp128 %extra9 to double ret double %trunc9 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-05.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-05.ll index 32bab44bff38fe..0a8ee0bf7bd607 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-05.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-05.ll @@ -5,7 +5,7 @@ declare fp128 @llvm.experimental.constrained.fmul.f128(fp128, fp128, metadata, metadata) ; There is no memory form of 128-bit multiplication. 
-define void @f1(fp128 *%ptr, float %f2) { +define void @f1(fp128 *%ptr, float %f2) strictfp { ; CHECK-LABEL: f1: ; CHECK-DAG: lxebr %f0, %f0 ; CHECK-DAG: ld %f1, 0(%r2) @@ -19,7 +19,7 @@ define void @f1(fp128 *%ptr, float %f2) { %diff = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %diff, fp128 *%ptr ret void } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll index 0de99aeedc9281..0f6405459168b8 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll @@ -5,7 +5,7 @@ declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) -define float @f1(float %f1, float %f2, float %acc) { +define float @f1(float %f1, float %f2, float %acc) #0 { ; CHECK-LABEL: f1: ; CHECK-SCALAR: maebr %f4, %f0, %f2 ; CHECK-SCALAR: ler %f0, %f4 @@ -14,11 +14,11 @@ define float @f1(float %f1, float %f2, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f2(float %f1, float *%ptr, float %acc) { +define float @f2(float %f1, float *%ptr, float %acc) #0 { ; CHECK-LABEL: f2: ; CHECK: maeb %f2, %f0, 0(%r2) ; CHECK-SCALAR: ler %f0, %f2 @@ -28,11 +28,11 @@ define float @f2(float %f1, float *%ptr, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f3(float %f1, float *%base, float %acc) { +define float @f3(float %f1, float *%base, float %acc) #0 { ; CHECK-LABEL: f3: ; CHECK: maeb %f2, %f0, 4092(%r2) ; CHECK-SCALAR: ler %f0, %f2 @@ -43,11 +43,11 @@ define float @f3(float %f1, float *%base, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f4(float %f1, float *%base, float %acc) { +define float @f4(float %f1, float *%base, float %acc) #0 { ; The important thing here is that we don't generate an out-of-range ; displacement. Other sequences besides this one would be OK. ; @@ -62,11 +62,11 @@ define float @f4(float %f1, float *%base, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f5(float %f1, float *%base, float %acc) { +define float @f5(float %f1, float *%base, float %acc) #0 { ; Here too the important thing is that we don't generate an out-of-range ; displacement. Other sequences besides this one would be OK. 
; @@ -81,11 +81,11 @@ define float @f5(float %f1, float *%base, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f6(float %f1, float *%base, i64 %index, float %acc) { +define float @f6(float %f1, float *%base, i64 %index, float %acc) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: maeb %f2, %f0, 0(%r1,%r2) @@ -97,11 +97,11 @@ define float @f6(float %f1, float *%base, i64 %index, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f7(float %f1, float *%base, i64 %index, float %acc) { +define float @f7(float %f1, float *%base, i64 %index, float %acc) #0 { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 2 ; CHECK: maeb %f2, %f0, 4092({{%r1,%r2|%r2,%r1}}) @@ -114,11 +114,11 @@ define float @f7(float %f1, float *%base, i64 %index, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f8(float %f1, float *%base, i64 %index, float %acc) { +define float @f8(float %f1, float *%base, i64 %index, float %acc) #0 { ; CHECK-LABEL: f8: ; CHECK: sllg %r1, %r3, 2 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) @@ -132,6 +132,8 @@ define float @f8(float %f1, float *%base, i64 %index, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-07.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-07.ll index b088aae16ad13e..d929fbba0ff499 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-07.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-07.ll @@ -5,7 +5,7 @@ declare double @llvm.experimental.constrained.fma.f64(double %f1, double %f2, double %f3, metadata, metadata) -define double @f1(double %f1, double %f2, double %acc) { +define double @f1(double %f1, double %f2, double %acc) #0 { ; CHECK-LABEL: f1: ; CHECK-SCALAR: madbr %f4, %f0, %f2 ; CHECK-SCALAR: ldr %f0, %f4 @@ -14,11 +14,11 @@ define double @f1(double %f1, double %f2, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f2(double %f1, double *%ptr, double %acc) { +define double @f2(double %f1, double *%ptr, double %acc) #0 { ; CHECK-LABEL: f2: ; CHECK: madb %f2, %f0, 0(%r2) ; CHECK: ldr %f0, %f2 @@ -27,11 +27,11 @@ define double @f2(double %f1, double *%ptr, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f3(double %f1, double *%base, double %acc) { +define double @f3(double %f1, double *%base, double %acc) #0 { ; CHECK-LABEL: f3: ; CHECK: madb %f2, %f0, 4088(%r2) ; CHECK: ldr %f0, %f2 @@ -41,11 +41,11 @@ define double @f3(double %f1, double *%base, double %acc) { %res = call double 
@llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f4(double %f1, double *%base, double %acc) { +define double @f4(double %f1, double *%base, double %acc) #0 { ; The important thing here is that we don't generate an out-of-range ; displacement. Other sequences besides this one would be OK. ; @@ -59,11 +59,11 @@ define double @f4(double %f1, double *%base, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f5(double %f1, double *%base, double %acc) { +define double @f5(double %f1, double *%base, double %acc) #0 { ; Here too the important thing is that we don't generate an out-of-range ; displacement. Other sequences besides this one would be OK. ; @@ -77,11 +77,11 @@ define double @f5(double %f1, double *%base, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f6(double %f1, double *%base, i64 %index, double %acc) { +define double @f6(double %f1, double *%base, i64 %index, double %acc) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 3 ; CHECK: madb %f2, %f0, 0(%r1,%r2) @@ -92,11 +92,11 @@ define double @f6(double %f1, double *%base, i64 %index, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f7(double %f1, double *%base, i64 %index, double %acc) { +define double @f7(double %f1, double *%base, i64 %index, double %acc) #0 { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 3 ; CHECK: madb %f2, %f0, 4088({{%r1,%r2|%r2,%r1}}) @@ -108,11 +108,11 @@ define double @f7(double %f1, double *%base, i64 %index, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f8(double %f1, double *%base, i64 %index, double %acc) { +define double @f8(double %f1, double *%base, i64 %index, double %acc) #0 { ; CHECK-LABEL: f8: ; CHECK: sllg %r1, %r3, 3 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) @@ -125,6 +125,8 @@ define double @f8(double %f1, double *%base, i64 %index, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-08.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-08.ll index ea3b7a1dba4b81..4c5101a795c0da 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-08.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-08.ll @@ -5,7 +5,7 @@ declare float @llvm.experimental.constrained.fma.f32(float %f1, float %f2, float %f3, metadata, metadata) -define float @f1(float %f1, float %f2, float %acc) { +define float @f1(float %f1, float %f2, float %acc) #0 { ; CHECK-LABEL: f1: ; CHECK-SCALAR: msebr %f4, %f0, %f2 ; CHECK-SCALAR: ler %f0, %f4 @@ -15,11 +15,11 @@ define float @f1(float %f1, float %f2, 
float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f2(float %f1, float *%ptr, float %acc) { +define float @f2(float %f1, float *%ptr, float %acc) #0 { ; CHECK-LABEL: f2: ; CHECK: mseb %f2, %f0, 0(%r2) ; CHECK-SCALAR: ler %f0, %f2 @@ -30,11 +30,11 @@ define float @f2(float %f1, float *%ptr, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f3(float %f1, float *%base, float %acc) { +define float @f3(float %f1, float *%base, float %acc) #0 { ; CHECK-LABEL: f3: ; CHECK: mseb %f2, %f0, 4092(%r2) ; CHECK-SCALAR: ler %f0, %f2 @@ -46,11 +46,11 @@ define float @f3(float %f1, float *%base, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f4(float %f1, float *%base, float %acc) { +define float @f4(float %f1, float *%base, float %acc) #0 { ; The important thing here is that we don't generate an out-of-range ; displacement. Other sequences besides this one would be OK. ; @@ -66,11 +66,11 @@ define float @f4(float %f1, float *%base, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f5(float %f1, float *%base, float %acc) { +define float @f5(float %f1, float *%base, float %acc) #0 { ; Here too the important thing is that we don't generate an out-of-range ; displacement. Other sequences besides this one would be OK. 
; @@ -86,11 +86,11 @@ define float @f5(float %f1, float *%base, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f6(float %f1, float *%base, i64 %index, float %acc) { +define float @f6(float %f1, float *%base, i64 %index, float %acc) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mseb %f2, %f0, 0(%r1,%r2) @@ -103,11 +103,11 @@ define float @f6(float %f1, float *%base, i64 %index, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f7(float %f1, float *%base, i64 %index, float %acc) { +define float @f7(float %f1, float *%base, i64 %index, float %acc) #0 { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 2 ; CHECK: mseb %f2, %f0, 4092({{%r1,%r2|%r2,%r1}}) @@ -121,11 +121,11 @@ define float @f7(float %f1, float *%base, i64 %index, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f8(float %f1, float *%base, i64 %index, float %acc) { +define float @f8(float %f1, float *%base, i64 %index, float %acc) #0 { ; CHECK-LABEL: f8: ; CHECK: sllg %r1, %r3, 2 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) @@ -140,6 +140,8 @@ define float @f8(float %f1, float *%base, i64 %index, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-09.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-09.ll index e8f6eeb4165284..357148c3b012fb 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-09.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-09.ll @@ -5,7 +5,7 @@ declare double @llvm.experimental.constrained.fma.f64(double %f1, double %f2, double %f3, metadata, metadata) -define double @f1(double %f1, double %f2, double %acc) { +define double @f1(double %f1, double %f2, double %acc) #0 { ; CHECK-LABEL: f1: ; CHECK-SCALAR: msdbr %f4, %f0, %f2 ; CHECK-SCALAR: ldr %f0, %f4 @@ -15,11 +15,11 @@ define double @f1(double %f1, double %f2, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f2(double %f1, double *%ptr, double %acc) { +define double @f2(double %f1, double *%ptr, double %acc) #0 { ; CHECK-LABEL: f2: ; CHECK: msdb %f2, %f0, 0(%r2) ; CHECK: ldr %f0, %f2 @@ -29,11 +29,11 @@ define double @f2(double %f1, double *%ptr, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f3(double %f1, double *%base, double %acc) { +define double @f3(double %f1, double *%base, double %acc) #0 { ; CHECK-LABEL: f3: ; CHECK: msdb %f2, %f0, 4088(%r2) ; CHECK: ldr %f0, %f2 @@ -44,11 +44,11 @@ define double @f3(double %f1, double *%base, double %acc) { %res = call double 
@llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f4(double %f1, double *%base, double %acc) { +define double @f4(double %f1, double *%base, double %acc) #0 { ; The important thing here is that we don't generate an out-of-range ; displacement. Other sequences besides this one would be OK. ; @@ -63,11 +63,11 @@ define double @f4(double %f1, double *%base, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f5(double %f1, double *%base, double %acc) { +define double @f5(double %f1, double *%base, double %acc) #0 { ; Here too the important thing is that we don't generate an out-of-range ; displacement. Other sequences besides this one would be OK. ; @@ -82,11 +82,11 @@ define double @f5(double %f1, double *%base, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f6(double %f1, double *%base, i64 %index, double %acc) { +define double @f6(double %f1, double *%base, i64 %index, double %acc) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 3 ; CHECK: msdb %f2, %f0, 0(%r1,%r2) @@ -98,11 +98,11 @@ define double @f6(double %f1, double *%base, i64 %index, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f7(double %f1, double *%base, i64 %index, double %acc) { +define double @f7(double %f1, double *%base, i64 %index, double %acc) #0 { ; CHECK-LABEL: f7: ; CHECK: sllg %r1, %r3, 3 ; CHECK: msdb %f2, %f0, 4088({{%r1,%r2|%r2,%r1}}) @@ -115,11 +115,11 @@ define double @f7(double %f1, double *%base, i64 %index, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f8(double %f1, double *%base, i64 %index, double %acc) { +define double @f8(double %f1, double *%base, i64 %index, double %acc) #0 { ; CHECK-LABEL: f8: ; CHECK: sllg %r1, %r3, 3 ; CHECK: lay %r1, 4096({{%r1,%r2|%r2,%r1}}) @@ -133,6 +133,8 @@ define double @f8(double %f1, double *%base, i64 %index, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-10.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-10.ll index dc0a4bbccbd5e9..a4291ec8d3730f 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-10.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-10.ll @@ -3,19 +3,19 @@ declare double @llvm.experimental.constrained.fma.f64(double %f1, double %f2, double %f3, metadata, metadata) declare float @llvm.experimental.constrained.fma.f32(float %f1, float %f2, float %f3, metadata, metadata) -define double @f1(double %f1, double %f2, double %acc) { +define double @f1(double %f1, double %f2, double %acc) #0 { ; 
CHECK-LABEL: f1: ; CHECK: wfnmadb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %negres = fsub double -0.0, %res ret double %negres } -define double @f2(double %f1, double %f2, double %acc) { +define double @f2(double %f1, double %f2, double %acc) #0 { ; CHECK-LABEL: f2: ; CHECK: wfnmsdb %f0, %f0, %f2, %f4 ; CHECK: br %r14 @@ -23,24 +23,24 @@ define double @f2(double %f1, double %f2, double %acc) { %res = call double @llvm.experimental.constrained.fma.f64 ( double %f1, double %f2, double %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %negres = fsub double -0.0, %res ret double %negres } -define float @f3(float %f1, float %f2, float %acc) { +define float @f3(float %f1, float %f2, float %acc) #0 { ; CHECK-LABEL: f3: ; CHECK: wfnmasb %f0, %f0, %f2, %f4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %acc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %negres = fsub float -0.0, %res ret float %negres } -define float @f4(float %f1, float %f2, float %acc) { +define float @f4(float %f1, float %f2, float %acc) #0 { ; CHECK-LABEL: f4: ; CHECK: wfnmssb %f0, %f0, %f2, %f4 ; CHECK: br %r14 @@ -48,8 +48,9 @@ define float @f4(float %f1, float %f2, float %acc) { %res = call float @llvm.experimental.constrained.fma.f32 ( float %f1, float %f2, float %negacc, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %negres = fsub float -0.0, %res ret float %negres } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-11.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-11.ll index a8133ad2e3e954..58e5bc453e612f 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-11.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-11.ll @@ -4,7 +4,7 @@ declare fp128 @llvm.experimental.constrained.fmul.f128(fp128, fp128, metadata, metadata) -define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +define void @f1(fp128 *%ptr1, fp128 *%ptr2) #0 { ; CHECK-LABEL: f1: ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) ; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) @@ -16,12 +16,12 @@ define void @f1(fp128 *%ptr1, fp128 *%ptr2) { %sum = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1, fp128 %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %sum, fp128 *%ptr1 ret void } -define void @f2(double %f1, double %f2, fp128 *%dst) { +define void @f2(double %f1, double %f2, fp128 *%dst) #0 { ; CHECK-LABEL: f2: ; CHECK-DAG: wflld [[REG1:%v[0-9]+]], %f0 ; CHECK-DAG: wflld [[REG2:%v[0-9]+]], %f2 @@ -33,8 +33,9 @@ define void @f2(double %f1, double %f2, fp128 *%dst) { %res = call fp128 @llvm.experimental.constrained.fmul.f128( fp128 %f1x, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%dst ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-round-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-round-01.ll index dfd2c07dca5e4f..92b5bdc65f598f 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-round-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-round-01.ll @@ -4,33 +4,33 @@ ; Test rint for f32. 
declare float @llvm.experimental.constrained.rint.f32(float, metadata, metadata) -define float @f1(float %f) { +define float @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: fiebr %f0, 0, %f0 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.rint.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test rint for f64. declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) -define double @f2(double %f) { +define double @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: fidbr %f0, 0, %f0 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.rint.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test rint for f128. declare fp128 @llvm.experimental.constrained.rint.f128(fp128, metadata, metadata) -define void @f3(fp128 *%ptr) { +define void @f3(fp128 *%ptr) #0 { ; CHECK-LABEL: f3: ; CHECK: fixbr %f0, 0, %f0 ; CHECK: br %r14 @@ -38,40 +38,40 @@ define void @f3(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.rint.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test nearbyint for f32. declare float @llvm.experimental.constrained.nearbyint.f32(float, metadata, metadata) -define float @f4(float %f) { +define float @f4(float %f) #0 { ; CHECK-LABEL: f4: ; CHECK: brasl %r14, nearbyintf@PLT ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.nearbyint.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test nearbyint for f64. declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) -define double @f5(double %f) { +define double @f5(double %f) #0 { ; CHECK-LABEL: f5: ; CHECK: brasl %r14, nearbyint@PLT ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.nearbyint.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test nearbyint for f128. declare fp128 @llvm.experimental.constrained.nearbyint.f128(fp128, metadata, metadata) -define void @f6(fp128 *%ptr) { +define void @f6(fp128 *%ptr) #0 { ; CHECK-LABEL: f6: ; CHECK: brasl %r14, nearbyintl@PLT ; CHECK: br %r14 @@ -79,40 +79,40 @@ define void @f6(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.nearbyint.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test floor for f32. declare float @llvm.experimental.constrained.floor.f32(float, metadata, metadata) -define float @f7(float %f) { +define float @f7(float %f) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, floorf@PLT ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.floor.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test floor for f64. declare double @llvm.experimental.constrained.floor.f64(double, metadata, metadata) -define double @f8(double %f) { +define double @f8(double %f) #0 { ; CHECK-LABEL: f8: ; CHECK: brasl %r14, floor@PLT ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.floor.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test floor for f128. 
declare fp128 @llvm.experimental.constrained.floor.f128(fp128, metadata, metadata) -define void @f9(fp128 *%ptr) { +define void @f9(fp128 *%ptr) #0 { ; CHECK-LABEL: f9: ; CHECK: brasl %r14, floorl@PLT ; CHECK: br %r14 @@ -120,40 +120,40 @@ define void @f9(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.floor.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test ceil for f32. declare float @llvm.experimental.constrained.ceil.f32(float, metadata, metadata) -define float @f10(float %f) { +define float @f10(float %f) #0 { ; CHECK-LABEL: f10: ; CHECK: brasl %r14, ceilf@PLT ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.ceil.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test ceil for f64. declare double @llvm.experimental.constrained.ceil.f64(double, metadata, metadata) -define double @f11(double %f) { +define double @f11(double %f) #0 { ; CHECK-LABEL: f11: ; CHECK: brasl %r14, ceil@PLT ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.ceil.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test ceil for f128. declare fp128 @llvm.experimental.constrained.ceil.f128(fp128, metadata, metadata) -define void @f12(fp128 *%ptr) { +define void @f12(fp128 *%ptr) #0 { ; CHECK-LABEL: f12: ; CHECK: brasl %r14, ceill@PLT ; CHECK: br %r14 @@ -161,40 +161,40 @@ define void @f12(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.ceil.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test trunc for f32. declare float @llvm.experimental.constrained.trunc.f32(float, metadata, metadata) -define float @f13(float %f) { +define float @f13(float %f) #0 { ; CHECK-LABEL: f13: ; CHECK: brasl %r14, truncf@PLT ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.trunc.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test trunc for f64. declare double @llvm.experimental.constrained.trunc.f64(double, metadata, metadata) -define double @f14(double %f) { +define double @f14(double %f) #0 { ; CHECK-LABEL: f14: ; CHECK: brasl %r14, trunc@PLT ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.trunc.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test trunc for f128. declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata, metadata) -define void @f15(fp128 *%ptr) { +define void @f15(fp128 *%ptr) #0 { ; CHECK-LABEL: f15: ; CHECK: brasl %r14, truncl@PLT ; CHECK: br %r14 @@ -202,40 +202,40 @@ define void @f15(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.trunc.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test round for f32. 
declare float @llvm.experimental.constrained.round.f32(float, metadata, metadata) -define float @f16(float %f) { +define float @f16(float %f) #0 { ; CHECK-LABEL: f16: ; CHECK: brasl %r14, roundf@PLT ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.round.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test round for f64. declare double @llvm.experimental.constrained.round.f64(double, metadata, metadata) -define double @f17(double %f) { +define double @f17(double %f) #0 { ; CHECK-LABEL: f17: ; CHECK: brasl %r14, round@PLT ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.round.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test round for f128. declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata, metadata) -define void @f18(fp128 *%ptr) { +define void @f18(fp128 *%ptr) #0 { ; CHECK-LABEL: f18: ; CHECK: brasl %r14, roundl@PLT ; CHECK: br %r14 @@ -243,8 +243,9 @@ define void @f18(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.round.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-round-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-round-02.ll index fe1be5769939bc..223e1a076bb8c0 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-round-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-round-02.ll @@ -7,20 +7,20 @@ ; Test rint for f32. declare float @llvm.experimental.constrained.rint.f32(float, metadata, metadata) -define float @f1(float %f) { +define float @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: fiebr %f0, 0, %f0 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.rint.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test rint for f64. declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) -define double @f2(double %f) { +define double @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK-SCALAR: fidbr %f0, 0, %f0 ; CHECK-VECTOR: fidbra %f0, 0, %f0, 0 @@ -28,13 +28,13 @@ define double @f2(double %f) { %res = call double @llvm.experimental.constrained.rint.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test rint for f128. declare fp128 @llvm.experimental.constrained.rint.f128(fp128, metadata, metadata) -define void @f3(fp128 *%ptr) { +define void @f3(fp128 *%ptr) #0 { ; CHECK-LABEL: f3: ; CHECK: fixbr %f0, 0, %f0 ; CHECK: br %r14 @@ -42,40 +42,40 @@ define void @f3(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.rint.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test nearbyint for f32. declare float @llvm.experimental.constrained.nearbyint.f32(float, metadata, metadata) -define float @f4(float %f) { +define float @f4(float %f) #0 { ; CHECK-LABEL: f4: ; CHECK: fiebra %f0, 0, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.nearbyint.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test nearbyint for f64. 
declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) -define double @f5(double %f) { +define double @f5(double %f) #0 { ; CHECK-LABEL: f5: ; CHECK: fidbra %f0, 0, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.nearbyint.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test nearbyint for f128. declare fp128 @llvm.experimental.constrained.nearbyint.f128(fp128, metadata, metadata) -define void @f6(fp128 *%ptr) { +define void @f6(fp128 *%ptr) #0 { ; CHECK-LABEL: f6: ; CHECK: fixbra %f0, 0, %f0, 4 ; CHECK: br %r14 @@ -83,40 +83,40 @@ define void @f6(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.nearbyint.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test floor for f32. declare float @llvm.experimental.constrained.floor.f32(float, metadata, metadata) -define float @f7(float %f) { +define float @f7(float %f) #0 { ; CHECK-LABEL: f7: ; CHECK: fiebra %f0, 7, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.floor.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test floor for f64. declare double @llvm.experimental.constrained.floor.f64(double, metadata, metadata) -define double @f8(double %f) { +define double @f8(double %f) #0 { ; CHECK-LABEL: f8: ; CHECK: fidbra %f0, 7, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.floor.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test floor for f128. declare fp128 @llvm.experimental.constrained.floor.f128(fp128, metadata, metadata) -define void @f9(fp128 *%ptr) { +define void @f9(fp128 *%ptr) #0 { ; CHECK-LABEL: f9: ; CHECK: fixbra %f0, 7, %f0, 4 ; CHECK: br %r14 @@ -124,40 +124,40 @@ define void @f9(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.floor.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test ceil for f32. declare float @llvm.experimental.constrained.ceil.f32(float, metadata, metadata) -define float @f10(float %f) { +define float @f10(float %f) #0 { ; CHECK-LABEL: f10: ; CHECK: fiebra %f0, 6, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.ceil.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test ceil for f64. declare double @llvm.experimental.constrained.ceil.f64(double, metadata, metadata) -define double @f11(double %f) { +define double @f11(double %f) #0 { ; CHECK-LABEL: f11: ; CHECK: fidbra %f0, 6, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.ceil.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test ceil for f128. 
declare fp128 @llvm.experimental.constrained.ceil.f128(fp128, metadata, metadata) -define void @f12(fp128 *%ptr) { +define void @f12(fp128 *%ptr) #0 { ; CHECK-LABEL: f12: ; CHECK: fixbra %f0, 6, %f0, 4 ; CHECK: br %r14 @@ -165,40 +165,40 @@ define void @f12(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.ceil.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test trunc for f32. declare float @llvm.experimental.constrained.trunc.f32(float, metadata, metadata) -define float @f13(float %f) { +define float @f13(float %f) #0 { ; CHECK-LABEL: f13: ; CHECK: fiebra %f0, 5, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.trunc.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test trunc for f64. declare double @llvm.experimental.constrained.trunc.f64(double, metadata, metadata) -define double @f14(double %f) { +define double @f14(double %f) #0 { ; CHECK-LABEL: f14: ; CHECK: fidbra %f0, 5, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.trunc.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test trunc for f128. declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata, metadata) -define void @f15(fp128 *%ptr) { +define void @f15(fp128 *%ptr) #0 { ; CHECK-LABEL: f15: ; CHECK: fixbra %f0, 5, %f0, 4 ; CHECK: br %r14 @@ -206,40 +206,40 @@ define void @f15(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.trunc.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test round for f32. declare float @llvm.experimental.constrained.round.f32(float, metadata, metadata) -define float @f16(float %f) { +define float @f16(float %f) #0 { ; CHECK-LABEL: f16: ; CHECK: fiebra %f0, 1, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.round.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test round for f64. declare double @llvm.experimental.constrained.round.f64(double, metadata, metadata) -define double @f17(double %f) { +define double @f17(double %f) #0 { ; CHECK-LABEL: f17: ; CHECK: fidbra %f0, 1, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.round.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test round for f128. declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata, metadata) -define void @f18(fp128 *%ptr) { +define void @f18(fp128 *%ptr) #0 { ; CHECK-LABEL: f18: ; CHECK: fixbra %f0, 1, %f0, 4 ; CHECK: br %r14 @@ -247,8 +247,9 @@ define void @f18(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.round.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-round-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-round-03.ll index e1f634b9ad4738..811fe8340f1ef0 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-round-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-round-03.ll @@ -4,33 +4,33 @@ ; Test rint for f32. 
declare float @llvm.experimental.constrained.rint.f32(float, metadata, metadata) -define float @f1(float %f) { +define float @f1(float %f) #0 { ; CHECK-LABEL: f1: ; CHECK: fiebra %f0, 0, %f0, 0 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.rint.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test rint for f64. declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) -define double @f2(double %f) { +define double @f2(double %f) #0 { ; CHECK-LABEL: f2: ; CHECK: fidbra %f0, 0, %f0, 0 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.rint.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test rint for f128. declare fp128 @llvm.experimental.constrained.rint.f128(fp128, metadata, metadata) -define void @f3(fp128 *%ptr) { +define void @f3(fp128 *%ptr) #0 { ; CHECK-LABEL: f3: ; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 0, 0 @@ -40,40 +40,40 @@ define void @f3(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.rint.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test nearbyint for f32. declare float @llvm.experimental.constrained.nearbyint.f32(float, metadata, metadata) -define float @f4(float %f) { +define float @f4(float %f) #0 { ; CHECK-LABEL: f4: ; CHECK: fiebra %f0, 0, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.nearbyint.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test nearbyint for f64. declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) -define double @f5(double %f) { +define double @f5(double %f) #0 { ; CHECK-LABEL: f5: ; CHECK: fidbra %f0, 0, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.nearbyint.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test nearbyint for f128. declare fp128 @llvm.experimental.constrained.nearbyint.f128(fp128, metadata, metadata) -define void @f6(fp128 *%ptr) { +define void @f6(fp128 *%ptr) #0 { ; CHECK-LABEL: f6: ; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 0 @@ -83,40 +83,40 @@ define void @f6(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.nearbyint.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test floor for f32. declare float @llvm.experimental.constrained.floor.f32(float, metadata, metadata) -define float @f7(float %f) { +define float @f7(float %f) #0 { ; CHECK-LABEL: f7: ; CHECK: fiebra %f0, 7, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.floor.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test floor for f64. 
declare double @llvm.experimental.constrained.floor.f64(double, metadata, metadata) -define double @f8(double %f) { +define double @f8(double %f) #0 { ; CHECK-LABEL: f8: ; CHECK: fidbra %f0, 7, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.floor.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test floor for f128. declare fp128 @llvm.experimental.constrained.floor.f128(fp128, metadata, metadata) -define void @f9(fp128 *%ptr) { +define void @f9(fp128 *%ptr) #0 { ; CHECK-LABEL: f9: ; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 7 @@ -126,40 +126,40 @@ define void @f9(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.floor.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test ceil for f32. declare float @llvm.experimental.constrained.ceil.f32(float, metadata, metadata) -define float @f10(float %f) { +define float @f10(float %f) #0 { ; CHECK-LABEL: f10: ; CHECK: fiebra %f0, 6, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.ceil.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test ceil for f64. declare double @llvm.experimental.constrained.ceil.f64(double, metadata, metadata) -define double @f11(double %f) { +define double @f11(double %f) #0 { ; CHECK-LABEL: f11: ; CHECK: fidbra %f0, 6, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.ceil.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test ceil for f128. declare fp128 @llvm.experimental.constrained.ceil.f128(fp128, metadata, metadata) -define void @f12(fp128 *%ptr) { +define void @f12(fp128 *%ptr) #0 { ; CHECK-LABEL: f12: ; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 6 @@ -169,40 +169,40 @@ define void @f12(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.ceil.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test trunc for f32. declare float @llvm.experimental.constrained.trunc.f32(float, metadata, metadata) -define float @f13(float %f) { +define float @f13(float %f) #0 { ; CHECK-LABEL: f13: ; CHECK: fiebra %f0, 5, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.trunc.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test trunc for f64. declare double @llvm.experimental.constrained.trunc.f64(double, metadata, metadata) -define double @f14(double %f) { +define double @f14(double %f) #0 { ; CHECK-LABEL: f14: ; CHECK: fidbra %f0, 5, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.trunc.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test trunc for f128. 
declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata, metadata) -define void @f15(fp128 *%ptr) { +define void @f15(fp128 *%ptr) #0 { ; CHECK-LABEL: f15: ; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 5 @@ -212,40 +212,40 @@ define void @f15(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.trunc.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } ; Test round for f32. declare float @llvm.experimental.constrained.round.f32(float, metadata, metadata) -define float @f16(float %f) { +define float @f16(float %f) #0 { ; CHECK-LABEL: f16: ; CHECK: fiebra %f0, 1, %f0, 4 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.round.f32( float %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Test round for f64. declare double @llvm.experimental.constrained.round.f64(double, metadata, metadata) -define double @f17(double %f) { +define double @f17(double %f) #0 { ; CHECK-LABEL: f17: ; CHECK: fidbra %f0, 1, %f0, 4 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.round.f64( double %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Test round for f128. declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata, metadata) -define void @f18(fp128 *%ptr) { +define void @f18(fp128 *%ptr) #0 { ; CHECK-LABEL: f18: ; CHECK: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wfixb [[RES:%v[0-9]+]], [[REG]], 4, 1 @@ -255,8 +255,9 @@ define void @f18(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.round.f128( fp128 %src, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128 *%ptr ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-01.ll index 158308c49be778..cff83950617c38 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-01.ll @@ -7,19 +7,19 @@ declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) ; Check register square root. -define float @f1(float %val) { +define float @f1(float %val) #0 { ; CHECK-LABEL: f1: ; CHECK: sqebr %f0, %f0 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.sqrt.f32( float %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the low end of the SQEB range. -define float @f2(float *%ptr) { +define float @f2(float *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: sqeb %f0, 0(%r2) ; CHECK: br %r14 @@ -27,12 +27,12 @@ define float @f2(float *%ptr) { %res = call float @llvm.experimental.constrained.sqrt.f32( float %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the high end of the aligned SQEB range. -define float @f3(float *%base) { +define float @f3(float *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: sqeb %f0, 4092(%r2) ; CHECK: br %r14 @@ -41,13 +41,13 @@ define float @f3(float *%base) { %res = call float @llvm.experimental.constrained.sqrt.f32( float %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the next word up, which needs separate address logic. ; Other sequences besides this one would be OK. 
-define float @f4(float *%base) { +define float @f4(float *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: sqeb %f0, 0(%r2) @@ -57,12 +57,12 @@ define float @f4(float *%base) { %res = call float @llvm.experimental.constrained.sqrt.f32( float %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check negative displacements, which also need separate address logic. -define float @f5(float *%base) { +define float @f5(float *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: sqeb %f0, 0(%r2) @@ -72,12 +72,12 @@ define float @f5(float *%base) { %res = call float @llvm.experimental.constrained.sqrt.f32( float %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that SQEB allows indices. -define float @f6(float *%base, i64 %index) { +define float @f6(float *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: sqeb %f0, 400(%r1,%r2) @@ -88,7 +88,8 @@ define float @f6(float *%base, i64 %index) { %res = call float @llvm.experimental.constrained.sqrt.f32( float %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-02.ll index 4e90939d2e73a1..791c39301e4818 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-02.ll @@ -7,19 +7,19 @@ declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) ; Check register square root. -define double @f1(double %val) { +define double @f1(double %val) #0 { ; CHECK-LABEL: f1: ; CHECK: sqdbr %f0, %f0 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.sqrt.f64( double %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the low end of the SQDB range. -define double @f2(double *%ptr) { +define double @f2(double *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: sqdb %f0, 0(%r2) ; CHECK: br %r14 @@ -27,12 +27,12 @@ define double @f2(double *%ptr) { %res = call double @llvm.experimental.constrained.sqrt.f64( double %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the high end of the aligned SQDB range. -define double @f3(double *%base) { +define double @f3(double *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: sqdb %f0, 4088(%r2) ; CHECK: br %r14 @@ -41,13 +41,13 @@ define double @f3(double *%base) { %res = call double @llvm.experimental.constrained.sqrt.f64( double %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the next doubleword up, which needs separate address logic. ; Other sequences besides this one would be OK. -define double @f4(double *%base) { +define double @f4(double *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: sqdb %f0, 0(%r2) @@ -57,12 +57,12 @@ define double @f4(double *%base) { %res = call double @llvm.experimental.constrained.sqrt.f64( double %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check negative displacements, which also need separate address logic. 
-define double @f5(double *%base) { +define double @f5(double *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -8 ; CHECK: sqdb %f0, 0(%r2) @@ -72,12 +72,12 @@ define double @f5(double *%base) { %res = call double @llvm.experimental.constrained.sqrt.f64( double %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that SQDB allows indices. -define double @f6(double *%base, i64 %index) { +define double @f6(double *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 3 ; CHECK: sqdb %f0, 800(%r1,%r2) @@ -88,7 +88,8 @@ define double @f6(double *%base, i64 %index) { %res = call double @llvm.experimental.constrained.sqrt.f64( double %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-03.ll index 9bcceb74f712fa..0f2f2729362be9 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-03.ll @@ -5,7 +5,7 @@ declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata) ; There's no memory form of SQXBR. -define void @f1(fp128 *%ptr) { +define void @f1(fp128 *%ptr) strictfp { ; CHECK-LABEL: f1: ; CHECK: ld %f0, 0(%r2) ; CHECK: ld %f2, 8(%r2) @@ -17,7 +17,7 @@ define void @f1(fp128 *%ptr) { %sqrt = call fp128 @llvm.experimental.constrained.sqrt.f128( fp128 %orig, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %sqrt, fp128 *%ptr ret void } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-04.ll index f24f958bcb05e8..0667aeb281fb76 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sqrt-04.ll @@ -4,7 +4,7 @@ declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata) -define void @f1(fp128 *%ptr) { +define void @f1(fp128 *%ptr) strictfp { ; CHECK-LABEL: f1: ; CHECK-DAG: vl [[REG:%v[0-9]+]], 0(%r2) ; CHECK: wfsqxb [[RES:%v[0-9]+]], [[REG]] @@ -14,7 +14,7 @@ define void @f1(fp128 *%ptr) { %res = call fp128 @llvm.experimental.constrained.sqrt.f128( fp128 %f, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %res, fp128 *%ptr ret void } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll index a4c485a2f9506d..82156e4856b5ad 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sub-01.ll @@ -8,19 +8,19 @@ declare float @foo() declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) ; Check register subtraction. -define float @f1(float %f1, float %f2) { +define float @f1(float %f1, float %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: sebr %f0, %f2 ; CHECK: br %r14 %res = call float @llvm.experimental.constrained.fsub.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the low end of the SEB range. 
-define float @f2(float %f1, float *%ptr) { +define float @f2(float %f1, float *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: seb %f0, 0(%r2) ; CHECK: br %r14 @@ -28,12 +28,12 @@ define float @f2(float %f1, float *%ptr) { %res = call float @llvm.experimental.constrained.fsub.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the high end of the aligned SEB range. -define float @f3(float %f1, float *%base) { +define float @f3(float %f1, float *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: seb %f0, 4092(%r2) ; CHECK: br %r14 @@ -42,13 +42,13 @@ define float @f3(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fsub.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check the next word up, which needs separate address logic. ; Other sequences besides this one would be OK. -define float @f4(float %f1, float *%base) { +define float @f4(float %f1, float *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: seb %f0, 0(%r2) @@ -58,12 +58,12 @@ define float @f4(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fsub.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check negative displacements, which also need separate address logic. -define float @f5(float %f1, float *%base) { +define float @f5(float %f1, float *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -4 ; CHECK: seb %f0, 0(%r2) @@ -73,12 +73,12 @@ define float @f5(float %f1, float *%base) { %res = call float @llvm.experimental.constrained.fsub.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that SEB allows indices. -define float @f6(float %f1, float *%base, i64 %index) { +define float @f6(float %f1, float *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 2 ; CHECK: seb %f0, 400(%r1,%r2) @@ -89,12 +89,12 @@ define float @f6(float %f1, float *%base, i64 %index) { %res = call float @llvm.experimental.constrained.fsub.f32( float %f1, float %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } ; Check that subtractions of spilled values can use SEB rather than SEBR. 
-define float @f7(float *%ptr0) { +define float @f7(float *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK-SCALAR: seb %f0, 16{{[04]}}(%r15) @@ -122,52 +122,54 @@ define float @f7(float *%ptr0) { %val9 = load float, float *%ptr9 %val10 = load float, float *%ptr10 - %ret = call float @foo() + %ret = call float @foo() #0 %sub0 = call float @llvm.experimental.constrained.fsub.f32( float %ret, float %val0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub1 = call float @llvm.experimental.constrained.fsub.f32( float %sub0, float %val1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub2 = call float @llvm.experimental.constrained.fsub.f32( float %sub1, float %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub3 = call float @llvm.experimental.constrained.fsub.f32( float %sub2, float %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub4 = call float @llvm.experimental.constrained.fsub.f32( float %sub3, float %val4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub5 = call float @llvm.experimental.constrained.fsub.f32( float %sub4, float %val5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub6 = call float @llvm.experimental.constrained.fsub.f32( float %sub5, float %val6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub7 = call float @llvm.experimental.constrained.fsub.f32( float %sub6, float %val7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub8 = call float @llvm.experimental.constrained.fsub.f32( float %sub7, float %val8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub9 = call float @llvm.experimental.constrained.fsub.f32( float %sub8, float %val9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub10 = call float @llvm.experimental.constrained.fsub.f32( float %sub9, float %val10, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %sub10 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sub-02.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sub-02.ll index 0d3cdd35103824..6184d88d83a77b 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sub-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sub-02.ll @@ -8,19 +8,19 @@ declare double @foo() declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) ; Check register subtraction. -define double @f1(double %f1, double %f2) { +define double @f1(double %f1, double %f2) #0 { ; CHECK-LABEL: f1: ; CHECK: sdbr %f0, %f2 ; CHECK: br %r14 %res = call double @llvm.experimental.constrained.fsub.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the low end of the SDB range. 
-define double @f2(double %f1, double *%ptr) { +define double @f2(double %f1, double *%ptr) #0 { ; CHECK-LABEL: f2: ; CHECK: sdb %f0, 0(%r2) ; CHECK: br %r14 @@ -28,12 +28,12 @@ define double @f2(double %f1, double *%ptr) { %res = call double @llvm.experimental.constrained.fsub.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the high end of the aligned SDB range. -define double @f3(double %f1, double *%base) { +define double @f3(double %f1, double *%base) #0 { ; CHECK-LABEL: f3: ; CHECK: sdb %f0, 4088(%r2) ; CHECK: br %r14 @@ -42,13 +42,13 @@ define double @f3(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fsub.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check the next doubleword up, which needs separate address logic. ; Other sequences besides this one would be OK. -define double @f4(double %f1, double *%base) { +define double @f4(double %f1, double *%base) #0 { ; CHECK-LABEL: f4: ; CHECK: aghi %r2, 4096 ; CHECK: sdb %f0, 0(%r2) @@ -58,12 +58,12 @@ define double @f4(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fsub.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check negative displacements, which also need separate address logic. -define double @f5(double %f1, double *%base) { +define double @f5(double %f1, double *%base) #0 { ; CHECK-LABEL: f5: ; CHECK: aghi %r2, -8 ; CHECK: sdb %f0, 0(%r2) @@ -73,12 +73,12 @@ define double @f5(double %f1, double *%base) { %res = call double @llvm.experimental.constrained.fsub.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that SDB allows indices. -define double @f6(double %f1, double *%base, i64 %index) { +define double @f6(double %f1, double *%base, i64 %index) #0 { ; CHECK-LABEL: f6: ; CHECK: sllg %r1, %r3, 3 ; CHECK: sdb %f0, 800(%r1,%r2) @@ -89,12 +89,12 @@ define double @f6(double %f1, double *%base, i64 %index) { %res = call double @llvm.experimental.constrained.fsub.f64( double %f1, double %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } ; Check that subtractions of spilled values can use SDB rather than SDBR. 
-define double @f7(double *%ptr0) { +define double @f7(double *%ptr0) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, foo@PLT ; CHECK-SCALAR: sdb %f0, 16{{[04]}}(%r15) @@ -122,52 +122,54 @@ define double @f7(double *%ptr0) { %val9 = load double, double *%ptr9 %val10 = load double, double *%ptr10 - %ret = call double @foo() + %ret = call double @foo() #0 %sub0 = call double @llvm.experimental.constrained.fsub.f64( double %ret, double %val0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub1 = call double @llvm.experimental.constrained.fsub.f64( double %sub0, double %val1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub2 = call double @llvm.experimental.constrained.fsub.f64( double %sub1, double %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub3 = call double @llvm.experimental.constrained.fsub.f64( double %sub2, double %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub4 = call double @llvm.experimental.constrained.fsub.f64( double %sub3, double %val4, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub5 = call double @llvm.experimental.constrained.fsub.f64( double %sub4, double %val5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub6 = call double @llvm.experimental.constrained.fsub.f64( double %sub5, double %val6, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub7 = call double @llvm.experimental.constrained.fsub.f64( double %sub6, double %val7, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub8 = call double @llvm.experimental.constrained.fsub.f64( double %sub7, double %val8, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub9 = call double @llvm.experimental.constrained.fsub.f64( double %sub8, double %val9, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %sub10 = call double @llvm.experimental.constrained.fsub.f64( double %sub9, double %val10, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %sub10 } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sub-03.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sub-03.ll index 63bb2fdd5bcc2a..cc3ee09e3a242c 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sub-03.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sub-03.ll @@ -5,7 +5,7 @@ declare fp128 @llvm.experimental.constrained.fsub.f128(fp128, fp128, metadata, metadata) ; There is no memory form of 128-bit subtraction. 
-define void @f1(fp128 *%ptr, float %f2) { +define void @f1(fp128 *%ptr, float %f2) strictfp { ; CHECK-LABEL: f1: ; CHECK-DAG: lxebr %f0, %f0 ; CHECK-DAG: ld %f1, 0(%r2) @@ -19,7 +19,7 @@ define void @f1(fp128 *%ptr, float %f2) { %sum = call fp128 @llvm.experimental.constrained.fsub.f128( fp128 %f1, fp128 %f2x, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %sum, fp128 *%ptr ret void } diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-sub-04.ll b/llvm/test/CodeGen/SystemZ/fp-strict-sub-04.ll index 1e8326847f2499..0eaf5f3afef208 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-sub-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-sub-04.ll @@ -4,7 +4,7 @@ declare fp128 @llvm.experimental.constrained.fsub.f128(fp128, fp128, metadata, metadata) -define void @f1(fp128 *%ptr1, fp128 *%ptr2) { +define void @f1(fp128 *%ptr1, fp128 *%ptr2) strictfp { ; CHECK-LABEL: f1: ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) ; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) @@ -16,7 +16,7 @@ define void @f1(fp128 *%ptr1, fp128 *%ptr2) { %sum = call fp128 @llvm.experimental.constrained.fsub.f128( fp128 %f1, fp128 %f2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp store fp128 %sum, fp128 *%ptr1 ret void } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-add-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-add-01.ll index d1270b9d2a03a5..1ac2e190d1dd94 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-add-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-add-01.ll @@ -7,19 +7,19 @@ declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 ; Test a v2f64 addition. define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2) { + <2 x double> %val2) strictfp { ; CHECK-LABEL: f5: ; CHECK: vfadb %v24, %v26, %v28 ; CHECK: br %r14 %ret = call <2 x double> @llvm.experimental.constrained.fadd.v2f64( <2 x double> %val1, <2 x double> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp ret <2 x double> %ret } ; Test an f64 addition that uses vector registers. -define double @f6(<2 x double> %val1, <2 x double> %val2) { +define double @f6(<2 x double> %val1, <2 x double> %val2) strictfp { ; CHECK-LABEL: f6: ; CHECK: wfadb %f0, %v24, %v26 ; CHECK: br %r14 @@ -28,6 +28,6 @@ define double @f6(<2 x double> %val1, <2 x double> %val2) { %ret = call double @llvm.experimental.constrained.fadd.f64( double %scalar1, double %scalar2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp ret double %ret } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-add-02.ll b/llvm/test/CodeGen/SystemZ/vec-strict-add-02.ll index 4aee31aee45f7b..7cdd6383178d79 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-add-02.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-add-02.ll @@ -7,19 +7,19 @@ declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x ; Test a v4f32 addition. define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2) { + <4 x float> %val2) strictfp { ; CHECK-LABEL: f1: ; CHECK: vfasb %v24, %v26, %v28 ; CHECK: br %r14 %ret = call <4 x float> @llvm.experimental.constrained.fadd.v4f32( <4 x float> %val1, <4 x float> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp ret <4 x float> %ret } ; Test an f32 addition that uses vector registers. 
-define float @f2(<4 x float> %val1, <4 x float> %val2) { +define float @f2(<4 x float> %val1, <4 x float> %val2) strictfp { ; CHECK-LABEL: f2: ; CHECK: wfasb %f0, %v24, %v26 ; CHECK: br %r14 @@ -28,6 +28,6 @@ define float @f2(<4 x float> %val1, <4 x float> %val2) { %ret = call float @llvm.experimental.constrained.fadd.f32( float %scalar1, float %scalar2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") strictfp ret float %ret } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-conv-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-conv-01.ll index a5fa0066b144be..045e2b2344cd68 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-conv-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-conv-01.ll @@ -14,54 +14,55 @@ declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(<2 x float>, declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(<2 x float>, metadata) ; Test conversion of f64s to signed i64s. -define <2 x i64> @f1(<2 x double> %doubles) { +define <2 x i64> @f1(<2 x double> %doubles) #0 { ; CHECK-LABEL: f1: ; CHECK: vcgdb %v24, %v24, 0, 5 ; CHECK: br %r14 %dwords = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double> %doubles, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i64> %dwords } ; Test conversion of f64s to unsigned i64s. -define <2 x i64> @f2(<2 x double> %doubles) { +define <2 x i64> @f2(<2 x double> %doubles) #0 { ; CHECK-LABEL: f2: ; CHECK: vclgdb %v24, %v24, 0, 5 ; CHECK: br %r14 %dwords = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double> %doubles, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i64> %dwords } ; Test conversion of f64s to signed i32s, which must compile. -define void @f5(<2 x double> %doubles, <2 x i32> *%ptr) { +define void @f5(<2 x double> %doubles, <2 x i32> *%ptr) #0 { %words = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64(<2 x double> %doubles, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <2 x i32> %words, <2 x i32> *%ptr ret void } ; Test conversion of f64s to unsigned i32s, which must compile. -define void @f6(<2 x double> %doubles, <2 x i32> *%ptr) { +define void @f6(<2 x double> %doubles, <2 x i32> *%ptr) #0 { %words = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64(<2 x double> %doubles, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <2 x i32> %words, <2 x i32> *%ptr ret void } ; Test conversion of f32s to signed i64s, which must compile. -define <2 x i64> @f9(<2 x float> *%ptr) { +define <2 x i64> @f9(<2 x float> *%ptr) #0 { %floats = load <2 x float>, <2 x float> *%ptr %dwords = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(<2 x float> %floats, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i64> %dwords } ; Test conversion of f32s to unsigned i64s, which must compile. 
-define <2 x i64> @f10(<2 x float> *%ptr) { +define <2 x i64> @f10(<2 x float> *%ptr) #0 { %floats = load <2 x float>, <2 x float> *%ptr %dwords = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(<2 x float> %floats, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i64> %dwords } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-conv-03.ll b/llvm/test/CodeGen/SystemZ/vec-strict-conv-03.ll index 6b9bc80cbdeb5d..ace2e1ec42e3c3 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-conv-03.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-conv-03.ll @@ -8,22 +8,23 @@ declare <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(<4 x float>, declare <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(<4 x float>, metadata) ; Test conversion of f32s to signed i32s. -define <4 x i32> @f1(<4 x float> %floats) { +define <4 x i32> @f1(<4 x float> %floats) #0 { ; CHECK-LABEL: f1: ; CHECK: vcfeb %v24, %v24, 0, 5 ; CHECK: br %r14 %words = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(<4 x float> %floats, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i32> %words } ; Test conversion of f32s to unsigned i32s. -define <4 x i32> @f2(<4 x float> %floats) { +define <4 x i32> @f2(<4 x float> %floats) #0 { ; CHECK-LABEL: f2: ; CHECK: vclfeb %v24, %v24, 0, 5 ; CHECK: br %r14 %words = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(<4 x float> %floats, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i32> %words } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-div-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-div-01.ll index ec54776254a83c..bb15c0d40ced41 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-div-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-div-01.ll @@ -7,19 +7,19 @@ declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 ; Test a v2f64 division. define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2) { + <2 x double> %val2) #0 { ; CHECK-LABEL: f5: ; CHECK: vfddb %v24, %v26, %v28 ; CHECK: br %r14 %ret = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64( <2 x double> %val1, <2 x double> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ret } ; Test an f64 division that uses vector registers. -define double @f6(<2 x double> %val1, <2 x double> %val2) { +define double @f6(<2 x double> %val1, <2 x double> %val2) #0 { ; CHECK-LABEL: f6: ; CHECK: wfddb %f0, %v24, %v26 ; CHECK: br %r14 @@ -28,6 +28,8 @@ define double @f6(<2 x double> %val1, <2 x double> %val2) { %ret = call double @llvm.experimental.constrained.fdiv.f64( double %scalar1, double %scalar2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-div-02.ll b/llvm/test/CodeGen/SystemZ/vec-strict-div-02.ll index 0fce46295eb681..b791d67b3244e5 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-div-02.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-div-02.ll @@ -7,19 +7,19 @@ declare <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float>, <4 x ; Test a v4f32 division. 
define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2) { + <4 x float> %val2) #0 { ; CHECK-LABEL: f1: ; CHECK: vfdsb %v24, %v26, %v28 ; CHECK: br %r14 %ret = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32( <4 x float> %val1, <4 x float> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %ret } ; Test an f32 division that uses vector registers. -define float @f2(<4 x float> %val1, <4 x float> %val2) { +define float @f2(<4 x float> %val1, <4 x float> %val2) #0 { ; CHECK-LABEL: f2: ; CHECK: wfdsb %f0, %v24, %v26 ; CHECK: br %r14 @@ -28,6 +28,8 @@ define float @f2(<4 x float> %val1, <4 x float> %val2) { %ret = call float @llvm.experimental.constrained.fdiv.f32( float %scalar1, float %scalar2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-max-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-max-01.ll index 82e7c32c0ef15e..c734a6aa5fee00 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-max-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-max-01.ll @@ -11,57 +11,57 @@ declare <4 x float> @llvm.experimental.constrained.maxnum.v4f32(<4 x float>, <4 declare fp128 @llvm.experimental.constrained.maxnum.f128(fp128, fp128, metadata, metadata) ; Test the f64 maxnum intrinsic. -define double @f1(double %dummy, double %val1, double %val2) { +define double @f1(double %dummy, double %val1, double %val2) #0 { ; CHECK-LABEL: f1: ; CHECK: wfmaxdb %f0, %f2, %f4, 4 ; CHECK: br %r14 %ret = call double @llvm.experimental.constrained.maxnum.f64( double %val1, double %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %ret } ; Test the v2f64 maxnum intrinsic. define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2) { + <2 x double> %val2) #0 { ; CHECK-LABEL: f2: ; CHECK: vfmaxdb %v24, %v26, %v28, 4 ; CHECK: br %r14 %ret = call <2 x double> @llvm.experimental.constrained.maxnum.v2f64( <2 x double> %val1, <2 x double> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ret } ; Test the f32 maxnum intrinsic. -define float @f3(float %dummy, float %val1, float %val2) { +define float @f3(float %dummy, float %val1, float %val2) #0 { ; CHECK-LABEL: f3: ; CHECK: wfmaxsb %f0, %f2, %f4, 4 ; CHECK: br %r14 %ret = call float @llvm.experimental.constrained.maxnum.f32( float %val1, float %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %ret } ; Test the v4f32 maxnum intrinsic. define <4 x float> @f4(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2) { + <4 x float> %val2) #0 { ; CHECK-LABEL: f4: ; CHECK: vfmaxsb %v24, %v26, %v28, 4 ; CHECK: br %r14 %ret = call <4 x float> @llvm.experimental.constrained.maxnum.v4f32( <4 x float> %val1, <4 x float> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %ret } ; Test the f128 maxnum intrinsic. 
-define void @f5(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) {
+define void @f5(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) #0 {
 ; CHECK-LABEL: f5:
 ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
 ; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3)
@@ -73,8 +73,9 @@ define void @f5(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) {
   %res = call fp128 @llvm.experimental.constrained.maxnum.f128(
                         fp128 %val1, fp128 %val2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   store fp128 %res, fp128* %dst
   ret void
 }
+attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-min-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-min-01.ll
index 641b9c33475aaf..25882568bdc3f5 100644
--- a/llvm/test/CodeGen/SystemZ/vec-strict-min-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-strict-min-01.ll
@@ -11,57 +11,57 @@ declare <4 x float> @llvm.experimental.constrained.minnum.v4f32(<4 x float>, <4
 declare fp128 @llvm.experimental.constrained.minnum.f128(fp128, fp128, metadata, metadata)
 
 ; Test the f64 minnum intrinsic.
-define double @f1(double %dummy, double %val1, double %val2) {
+define double @f1(double %dummy, double %val1, double %val2) #0 {
 ; CHECK-LABEL: f1:
 ; CHECK: wfmindb %f0, %f2, %f4, 4
 ; CHECK: br %r14
   %ret = call double @llvm.experimental.constrained.minnum.f64(
                         double %val1, double %val2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret double %ret
 }
 
 ; Test the v2f64 minnum intrinsic.
 define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1,
-                        <2 x double> %val2) {
+                        <2 x double> %val2) #0 {
 ; CHECK-LABEL: f2:
 ; CHECK: vfmindb %v24, %v26, %v28, 4
 ; CHECK: br %r14
   %ret = call <2 x double> @llvm.experimental.constrained.minnum.v2f64(
                         <2 x double> %val1, <2 x double> %val2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <2 x double> %ret
 }
 
 ; Test the f32 minnum intrinsic.
-define float @f3(float %dummy, float %val1, float %val2) {
+define float @f3(float %dummy, float %val1, float %val2) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK: wfminsb %f0, %f2, %f4, 4
 ; CHECK: br %r14
   %ret = call float @llvm.experimental.constrained.minnum.f32(
                         float %val1, float %val2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret float %ret
 }
 
 ; Test the v4f32 minnum intrinsic.
 define <4 x float> @f4(<4 x float> %dummy, <4 x float> %val1,
-                       <4 x float> %val2) {
+                       <4 x float> %val2) #0 {
 ; CHECK-LABEL: f4:
 ; CHECK: vfminsb %v24, %v26, %v28, 4
 ; CHECK: br %r14
   %ret = call <4 x float> @llvm.experimental.constrained.minnum.v4f32(
                         <4 x float> %val1, <4 x float> %val2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.strict") #0
   ret <4 x float> %ret
 }
 
 ; Test the f128 minnum intrinsic.
-define void @f5(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { +define void @f5(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) #0 { ; CHECK-LABEL: f5: ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2) ; CHECK-DAG: vl [[REG2:%v[0-9]+]], 0(%r3) @@ -73,8 +73,9 @@ define void @f5(fp128 *%ptr1, fp128 *%ptr2, fp128 *%dst) { %res = call fp128 @llvm.experimental.constrained.minnum.f128( fp128 %val1, fp128 %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store fp128 %res, fp128* %dst ret void } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-mul-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-mul-01.ll index 0bb10188bacf45..52c40f4cb73e75 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-mul-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-mul-01.ll @@ -7,19 +7,19 @@ declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 ; Test a v2f64 multiplication. define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2) { + <2 x double> %val2) #0 { ; CHECK-LABEL: f5: ; CHECK: vfmdb %v24, %v26, %v28 ; CHECK: br %r14 %ret = call <2 x double> @llvm.experimental.constrained.fmul.v2f64( <2 x double> %val1, <2 x double> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ret } ; Test an f64 multiplication that uses vector registers. -define double @f6(<2 x double> %val1, <2 x double> %val2) { +define double @f6(<2 x double> %val1, <2 x double> %val2) #0 { ; CHECK-LABEL: f6: ; CHECK: wfmdb %f0, %v24, %v26 ; CHECK: br %r14 @@ -28,6 +28,8 @@ define double @f6(<2 x double> %val1, <2 x double> %val2) { %ret = call double @llvm.experimental.constrained.fmul.f64( double %scalar1, double %scalar2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-mul-02.ll b/llvm/test/CodeGen/SystemZ/vec-strict-mul-02.ll index 61447aa9b0d11f..fc9c1575952c0e 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-mul-02.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-mul-02.ll @@ -6,7 +6,7 @@ declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x ; Test a v2f64 multiply-and-add. define <2 x double> @f4(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2, <2 x double> %val3) { + <2 x double> %val2, <2 x double> %val3) #0 { ; CHECK-LABEL: f4: ; CHECK: vfmadb %v24, %v26, %v28, %v30 ; CHECK: br %r14 @@ -15,13 +15,13 @@ define <2 x double> @f4(<2 x double> %dummy, <2 x double> %val1, <2 x double> %val2, <2 x double> %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ret } ; Test a v2f64 multiply-and-subtract. 
define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2, <2 x double> %val3) { + <2 x double> %val2, <2 x double> %val3) #0 { ; CHECK-LABEL: f5: ; CHECK: vfmsdb %v24, %v26, %v28, %v30 ; CHECK: br %r14 @@ -31,6 +31,8 @@ define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, <2 x double> %val2, <2 x double> %negval3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-mul-03.ll b/llvm/test/CodeGen/SystemZ/vec-strict-mul-03.ll index a61d55913a1784..a05ed27d4c65ae 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-mul-03.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-mul-03.ll @@ -7,19 +7,19 @@ declare <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float>, <4 x ; Test a v4f32 multiplication. define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2) { + <4 x float> %val2) #0 { ; CHECK-LABEL: f1: ; CHECK: vfmsb %v24, %v26, %v28 ; CHECK: br %r14 %ret = call <4 x float> @llvm.experimental.constrained.fmul.v4f32( <4 x float> %val1, <4 x float> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %ret } ; Test an f32 multiplication that uses vector registers. -define float @f2(<4 x float> %val1, <4 x float> %val2) { +define float @f2(<4 x float> %val1, <4 x float> %val2) #0 { ; CHECK-LABEL: f2: ; CHECK: wfmsb %f0, %v24, %v26 ; CHECK: br %r14 @@ -28,6 +28,8 @@ define float @f2(<4 x float> %val1, <4 x float> %val2) { %ret = call float @llvm.experimental.constrained.fmul.f32( float %scalar1, float %scalar2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-mul-04.ll b/llvm/test/CodeGen/SystemZ/vec-strict-mul-04.ll index e24c38959069bf..3a4b1448d4674a 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-mul-04.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-mul-04.ll @@ -6,7 +6,7 @@ declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x f ; Test a v4f32 multiply-and-add. define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2, <4 x float> %val3) { + <4 x float> %val2, <4 x float> %val3) #0 { ; CHECK-LABEL: f1: ; CHECK: vfmasb %v24, %v26, %v28, %v30 ; CHECK: br %r14 @@ -15,13 +15,13 @@ define <4 x float> @f1(<4 x float> %dummy, <4 x float> %val1, <4 x float> %val2, <4 x float> %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %ret } ; Test a v4f32 multiply-and-subtract. 
define <4 x float> @f2(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2, <4 x float> %val3) { + <4 x float> %val2, <4 x float> %val3) #0 { ; CHECK-LABEL: f2: ; CHECK: vfmssb %v24, %v26, %v28, %v30 ; CHECK: br %r14 @@ -32,6 +32,8 @@ define <4 x float> @f2(<4 x float> %dummy, <4 x float> %val1, <4 x float> %val2, <4 x float> %negval3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-mul-05.ll b/llvm/test/CodeGen/SystemZ/vec-strict-mul-05.ll index 9fdefc505b0ced..b99378bc7b725e 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-mul-05.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-mul-05.ll @@ -7,7 +7,7 @@ declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x f ; Test a v2f64 negative multiply-and-add. define <2 x double> @f1(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2, <2 x double> %val3) { + <2 x double> %val2, <2 x double> %val3) #0 { ; CHECK-LABEL: f1: ; CHECK: vfnmadb %v24, %v26, %v28, %v30 ; CHECK: br %r14 @@ -16,14 +16,14 @@ define <2 x double> @f1(<2 x double> %dummy, <2 x double> %val1, <2 x double> %val2, <2 x double> %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %negret = fsub <2 x double> , %ret ret <2 x double> %negret } ; Test a v2f64 negative multiply-and-subtract. define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2, <2 x double> %val3) { + <2 x double> %val2, <2 x double> %val3) #0 { ; CHECK-LABEL: f2: ; CHECK: vfnmsdb %v24, %v26, %v28, %v30 ; CHECK: br %r14 @@ -33,14 +33,14 @@ define <2 x double> @f2(<2 x double> %dummy, <2 x double> %val1, <2 x double> %val2, <2 x double> %negval3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %negret = fsub <2 x double> , %ret ret <2 x double> %negret } ; Test a v4f32 negative multiply-and-add. define <4 x float> @f3(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2, <4 x float> %val3) { + <4 x float> %val2, <4 x float> %val3) #0 { ; CHECK-LABEL: f3: ; CHECK: vfnmasb %v24, %v26, %v28, %v30 ; CHECK: br %r14 @@ -49,7 +49,7 @@ define <4 x float> @f3(<4 x float> %dummy, <4 x float> %val1, <4 x float> %val2, <4 x float> %val3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %negret = fsub <4 x float> , %ret ret <4 x float> %negret @@ -57,7 +57,7 @@ define <4 x float> @f3(<4 x float> %dummy, <4 x float> %val1, ; Test a v4f32 negative multiply-and-subtract. 
define <4 x float> @f4(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2, <4 x float> %val3) { + <4 x float> %val2, <4 x float> %val3) #0 { ; CHECK-LABEL: f4: ; CHECK: vfnmssb %v24, %v26, %v28, %v30 ; CHECK: br %r14 @@ -68,8 +68,10 @@ define <4 x float> @f4(<4 x float> %dummy, <4 x float> %val1, <4 x float> %val2, <4 x float> %negval3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %negret = fsub <4 x float> , %ret ret <4 x float> %negret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-round-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-round-01.ll index e86357bb139744..9f73c73c212bd8 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-round-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-round-01.ll @@ -15,73 +15,73 @@ declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, met declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata, metadata) -define <2 x double> @f1(<2 x double> %val) { +define <2 x double> @f1(<2 x double> %val) #0 { ; CHECK-LABEL: f1: ; CHECK: vfidb %v24, %v24, 0, 0 ; CHECK: br %r14 %res = call <2 x double> @llvm.experimental.constrained.rint.v2f64( <2 x double> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %res } -define <2 x double> @f2(<2 x double> %val) { +define <2 x double> @f2(<2 x double> %val) #0 { ; CHECK-LABEL: f2: ; CHECK: vfidb %v24, %v24, 4, 0 ; CHECK: br %r14 %res = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64( <2 x double> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %res } -define <2 x double> @f3(<2 x double> %val) { +define <2 x double> @f3(<2 x double> %val) #0 { ; CHECK-LABEL: f3: ; CHECK: vfidb %v24, %v24, 4, 7 ; CHECK: br %r14 %res = call <2 x double> @llvm.experimental.constrained.floor.v2f64( <2 x double> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %res } -define <2 x double> @f4(<2 x double> %val) { +define <2 x double> @f4(<2 x double> %val) #0 { ; CHECK-LABEL: f4: ; CHECK: vfidb %v24, %v24, 4, 6 ; CHECK: br %r14 %res = call <2 x double> @llvm.experimental.constrained.ceil.v2f64( <2 x double> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %res } -define <2 x double> @f5(<2 x double> %val) { +define <2 x double> @f5(<2 x double> %val) #0 { ; CHECK-LABEL: f5: ; CHECK: vfidb %v24, %v24, 4, 5 ; CHECK: br %r14 %res = call <2 x double> @llvm.experimental.constrained.trunc.v2f64( <2 x double> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %res } -define <2 x double> @f6(<2 x double> %val) { +define <2 x double> @f6(<2 x double> %val) #0 { ; CHECK-LABEL: f6: ; CHECK: vfidb %v24, %v24, 4, 1 ; CHECK: br %r14 %res = call <2 x double> @llvm.experimental.constrained.round.v2f64( <2 x double> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %res } -define double @f7(<2 x double> %val) { +define double @f7(<2 x double> %val) #0 { ; CHECK-LABEL: f7: ; CHECK: wfidb %f0, %v24, 0, 0 ; CHECK: br %r14 @@ -89,11 +89,11 @@ define double @f7(<2 x double> %val) { %res = call double 
@llvm.experimental.constrained.rint.f64( double %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f8(<2 x double> %val) { +define double @f8(<2 x double> %val) #0 { ; CHECK-LABEL: f8: ; CHECK: wfidb %f0, %v24, 4, 0 ; CHECK: br %r14 @@ -101,11 +101,11 @@ define double @f8(<2 x double> %val) { %res = call double @llvm.experimental.constrained.nearbyint.f64( double %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f9(<2 x double> %val) { +define double @f9(<2 x double> %val) #0 { ; CHECK-LABEL: f9: ; CHECK: wfidb %f0, %v24, 4, 7 ; CHECK: br %r14 @@ -113,12 +113,12 @@ define double @f9(<2 x double> %val) { %res = call double @llvm.experimental.constrained.floor.f64( double %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f10(<2 x double> %val) { +define double @f10(<2 x double> %val) #0 { ; CHECK-LABEL: f10: ; CHECK: wfidb %f0, %v24, 4, 6 ; CHECK: br %r14 @@ -126,11 +126,11 @@ define double @f10(<2 x double> %val) { %res = call double @llvm.experimental.constrained.ceil.f64( double %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f11(<2 x double> %val) { +define double @f11(<2 x double> %val) #0 { ; CHECK-LABEL: f11: ; CHECK: wfidb %f0, %v24, 4, 5 ; CHECK: br %r14 @@ -138,11 +138,11 @@ define double @f11(<2 x double> %val) { %res = call double @llvm.experimental.constrained.trunc.f64( double %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } -define double @f12(<2 x double> %val) { +define double @f12(<2 x double> %val) #0 { ; CHECK-LABEL: f12: ; CHECK: wfidb %f0, %v24, 4, 1 ; CHECK: br %r14 @@ -150,6 +150,8 @@ define double @f12(<2 x double> %val) { %res = call double @llvm.experimental.constrained.round.f64( double %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %res } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-round-02.ll b/llvm/test/CodeGen/SystemZ/vec-strict-round-02.ll index 2ee9b10ccdf081..9eec926f4cb2a2 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-round-02.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-round-02.ll @@ -15,73 +15,73 @@ declare <4 x float> @llvm.experimental.constrained.ceil.v4f32(<4 x float>, metad declare <4 x float> @llvm.experimental.constrained.trunc.v4f32(<4 x float>, metadata, metadata) declare <4 x float> @llvm.experimental.constrained.round.v4f32(<4 x float>, metadata, metadata) -define <4 x float> @f1(<4 x float> %val) { +define <4 x float> @f1(<4 x float> %val) #0 { ; CHECK-LABEL: f1: ; CHECK: vfisb %v24, %v24, 0, 0 ; CHECK: br %r14 %res = call <4 x float> @llvm.experimental.constrained.rint.v4f32( <4 x float> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %res } -define <4 x float> @f2(<4 x float> %val) { +define <4 x float> @f2(<4 x float> %val) #0 { ; CHECK-LABEL: f2: ; CHECK: vfisb %v24, %v24, 4, 0 ; CHECK: br %r14 %res = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32( <4 x float> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %res } -define <4 x float> @f3(<4 x float> %val) { +define <4 x float> 
@f3(<4 x float> %val) #0 { ; CHECK-LABEL: f3: ; CHECK: vfisb %v24, %v24, 4, 7 ; CHECK: br %r14 %res = call <4 x float> @llvm.experimental.constrained.floor.v4f32( <4 x float> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %res } -define <4 x float> @f4(<4 x float> %val) { +define <4 x float> @f4(<4 x float> %val) #0 { ; CHECK-LABEL: f4: ; CHECK: vfisb %v24, %v24, 4, 6 ; CHECK: br %r14 %res = call <4 x float> @llvm.experimental.constrained.ceil.v4f32( <4 x float> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %res } -define <4 x float> @f5(<4 x float> %val) { +define <4 x float> @f5(<4 x float> %val) #0 { ; CHECK-LABEL: f5: ; CHECK: vfisb %v24, %v24, 4, 5 ; CHECK: br %r14 %res = call <4 x float> @llvm.experimental.constrained.trunc.v4f32( <4 x float> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %res } -define <4 x float> @f6(<4 x float> %val) { +define <4 x float> @f6(<4 x float> %val) #0 { ; CHECK-LABEL: f6: ; CHECK: vfisb %v24, %v24, 4, 1 ; CHECK: br %r14 %res = call <4 x float> @llvm.experimental.constrained.round.v4f32( <4 x float> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %res } -define float @f7(<4 x float> %val) { +define float @f7(<4 x float> %val) #0 { ; CHECK-LABEL: f7: ; CHECK: wfisb %f0, %v24, 0, 0 ; CHECK: br %r14 @@ -89,11 +89,11 @@ define float @f7(<4 x float> %val) { %res = call float @llvm.experimental.constrained.rint.f32( float %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f8(<4 x float> %val) { +define float @f8(<4 x float> %val) #0 { ; CHECK-LABEL: f8: ; CHECK: wfisb %f0, %v24, 4, 0 ; CHECK: br %r14 @@ -101,11 +101,11 @@ define float @f8(<4 x float> %val) { %res = call float @llvm.experimental.constrained.nearbyint.f32( float %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f9(<4 x float> %val) { +define float @f9(<4 x float> %val) #0 { ; CHECK-LABEL: f9: ; CHECK: wfisb %f0, %v24, 4, 7 ; CHECK: br %r14 @@ -113,11 +113,11 @@ define float @f9(<4 x float> %val) { %res = call float @llvm.experimental.constrained.floor.f32( float %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f10(<4 x float> %val) { +define float @f10(<4 x float> %val) #0 { ; CHECK-LABEL: f10: ; CHECK: wfisb %f0, %v24, 4, 6 ; CHECK: br %r14 @@ -125,11 +125,11 @@ define float @f10(<4 x float> %val) { %res = call float @llvm.experimental.constrained.ceil.f32( float %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f11(<4 x float> %val) { +define float @f11(<4 x float> %val) #0 { ; CHECK-LABEL: f11: ; CHECK: wfisb %f0, %v24, 4, 5 ; CHECK: br %r14 @@ -137,11 +137,11 @@ define float @f11(<4 x float> %val) { %res = call float @llvm.experimental.constrained.trunc.f32( float %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } -define float @f12(<4 x float> %val) { +define float @f12(<4 x float> %val) #0 { ; CHECK-LABEL: f12: ; CHECK: wfisb %f0, %v24, 4, 1 ; CHECK: br %r14 @@ -149,6 +149,8 @@ define float @f12(<4 x float> %val) { %res = call float 
@llvm.experimental.constrained.round.f32( float %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %res } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-sqrt-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-sqrt-01.ll index f59558f11acc8a..f7fee09af3f80c 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-sqrt-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-sqrt-01.ll @@ -5,18 +5,18 @@ declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata) -define <2 x double> @f1(<2 x double> %val) { +define <2 x double> @f1(<2 x double> %val) #0 { ; CHECK-LABEL: f1: ; CHECK: vfsqdb %v24, %v24 ; CHECK: br %r14 %ret = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64( <2 x double> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ret } -define double @f2(<2 x double> %val) { +define double @f2(<2 x double> %val) #0 { ; CHECK-LABEL: f2: ; CHECK: wfsqdb %f0, %v24 ; CHECK: br %r14 @@ -24,6 +24,8 @@ define double @f2(<2 x double> %val) { %ret = call double @llvm.experimental.constrained.sqrt.f64( double %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-sqrt-02.ll b/llvm/test/CodeGen/SystemZ/vec-strict-sqrt-02.ll index 591c2c48cdb8eb..8c60bd3f68592d 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-sqrt-02.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-sqrt-02.ll @@ -5,18 +5,18 @@ declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) declare <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float>, metadata, metadata) -define <4 x float> @f1(<4 x float> %val) { +define <4 x float> @f1(<4 x float> %val) #0 { ; CHECK-LABEL: f1: ; CHECK: vfsqsb %v24, %v24 ; CHECK: br %r14 %ret = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32( <4 x float> %val, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %ret } -define float @f2(<4 x float> %val) { +define float @f2(<4 x float> %val) #0 { ; CHECK-LABEL: f2: ; CHECK: wfsqsb %f0, %v24 ; CHECK: br %r14 @@ -24,6 +24,8 @@ define float @f2(<4 x float> %val) { %ret = call float @llvm.experimental.constrained.sqrt.f32( float %scalar, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-sub-01.ll b/llvm/test/CodeGen/SystemZ/vec-strict-sub-01.ll index 8564d227529b30..a379613c389c8b 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-sub-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-sub-01.ll @@ -7,19 +7,19 @@ declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 ; Test a v2f64 subtraction. define <2 x double> @f6(<2 x double> %dummy, <2 x double> %val1, - <2 x double> %val2) { + <2 x double> %val2) #0 { ; CHECK-LABEL: f6: ; CHECK: vfsdb %v24, %v26, %v28 ; CHECK: br %r14 %ret = call <2 x double> @llvm.experimental.constrained.fsub.v2f64( <2 x double> %val1, <2 x double> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ret } ; Test an f64 subtraction that uses vector registers. 
-define double @f7(<2 x double> %val1, <2 x double> %val2) { +define double @f7(<2 x double> %val1, <2 x double> %val2) #0 { ; CHECK-LABEL: f7: ; CHECK: wfsdb %f0, %v24, %v26 ; CHECK: br %r14 @@ -28,7 +28,8 @@ define double @f7(<2 x double> %val1, <2 x double> %val2) { %ret = call double @llvm.experimental.constrained.fsub.f64( double %scalar1, double %scalar2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %ret } +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vec-strict-sub-02.ll b/llvm/test/CodeGen/SystemZ/vec-strict-sub-02.ll index 1843678d23cad8..fc93e6a0918555 100644 --- a/llvm/test/CodeGen/SystemZ/vec-strict-sub-02.ll +++ b/llvm/test/CodeGen/SystemZ/vec-strict-sub-02.ll @@ -7,19 +7,19 @@ declare <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float>, <4 x ; Test a v4f32 subtraction. define <4 x float> @f6(<4 x float> %dummy, <4 x float> %val1, - <4 x float> %val2) { + <4 x float> %val2) #0 { ; CHECK-LABEL: f6: ; CHECK: vfssb %v24, %v26, %v28 ; CHECK: br %r14 %ret = call <4 x float> @llvm.experimental.constrained.fsub.v4f32( <4 x float> %val1, <4 x float> %val2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %ret } ; Test an f32 subtraction that uses vector registers. -define float @f7(<4 x float> %val1, <4 x float> %val2) { +define float @f7(<4 x float> %val1, <4 x float> %val2) #0 { ; CHECK-LABEL: f7: ; CHECK: wfssb %f0, %v24, %v26 ; CHECK: br %r14 @@ -28,6 +28,8 @@ define float @f7(<4 x float> %val1, <4 x float> %val2) { %ret = call float @llvm.experimental.constrained.fsub.f32( float %scalar1, float %scalar2, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %ret } + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll index 8ab4c6db255bab..64097eea38ff75 100644 --- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll @@ -2,7 +2,7 @@ ; RUN: llc -O3 -mtriple=s390x-linux-gnu < %s | FileCheck --check-prefix=S390X %s ; RUN: llc -O3 -mtriple=s390x-linux-gnu -mcpu=z13 < %s | FileCheck --check-prefix=SZ13 %s -define <1 x float> @constrained_vector_fdiv_v1f32() { +define <1 x float> @constrained_vector_fdiv_v1f32() #0 { ; S390X-LABEL: constrained_vector_fdiv_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI0_0 @@ -23,11 +23,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %div } -define <2 x double> @constrained_vector_fdiv_v2f64() { +define <2 x double> @constrained_vector_fdiv_v2f64() #0 { ; S390X-LABEL: constrained_vector_fdiv_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI1_0 @@ -53,11 +53,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %div } -define <3 x float> @constrained_vector_fdiv_v3f32() { +define <3 x float> @constrained_vector_fdiv_v3f32() #0 { ; S390X-LABEL: constrained_vector_fdiv_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI2_0 @@ -93,11 +93,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %div } -define void 
@constrained_vector_fdiv_v3f64(<3 x double>* %a) { +define void @constrained_vector_fdiv_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_fdiv_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: ld %f0, 16(%r2) @@ -134,12 +134,12 @@ entry: <3 x double> , <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %div, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_fdiv_v4f64() { +define <4 x double> @constrained_vector_fdiv_v4f64() #0 { ; S390X-LABEL: constrained_vector_fdiv_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI4_0 @@ -176,11 +176,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %div } -define <1 x float> @constrained_vector_frem_v1f32() { +define <1 x float> @constrained_vector_frem_v1f32() #0 { ; S390X-LABEL: constrained_vector_frem_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -216,11 +216,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %rem } -define <2 x double> @constrained_vector_frem_v2f64() { +define <2 x double> @constrained_vector_frem_v2f64() #0 { ; S390X-LABEL: constrained_vector_frem_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -281,11 +281,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %rem } -define <3 x float> @constrained_vector_frem_v3f32() { +define <3 x float> @constrained_vector_frem_v3f32() #0 { ; S390X-LABEL: constrained_vector_frem_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -365,11 +365,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %rem } -define void @constrained_vector_frem_v3f64(<3 x double>* %a) { +define void @constrained_vector_frem_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_frem_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -456,12 +456,12 @@ entry: <3 x double> , <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %rem, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_frem_v4f64() { +define <4 x double> @constrained_vector_frem_v4f64() #0 { ; S390X-LABEL: constrained_vector_frem_v4f64: ; S390X: # %bb.0: ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -558,11 +558,11 @@ define <4 x double> @constrained_vector_frem_v4f64() { <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %rem } -define <1 x float> @constrained_vector_fmul_v1f32() { +define <1 x float> @constrained_vector_fmul_v1f32() #0 { ; S390X-LABEL: constrained_vector_fmul_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI10_0 @@ -583,11 +583,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %mul } -define <2 x double> @constrained_vector_fmul_v2f64() { +define <2 x double> @constrained_vector_fmul_v2f64() #0 { ; S390X-LABEL: constrained_vector_fmul_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI11_0 @@ -613,11 +613,11 @@ entry: <2 x double> , <2 x 
double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %mul } -define <3 x float> @constrained_vector_fmul_v3f32() { +define <3 x float> @constrained_vector_fmul_v3f32() #0 { ; S390X-LABEL: constrained_vector_fmul_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI12_0 @@ -652,11 +652,11 @@ entry: float 0x7FF0000000000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %mul } -define void @constrained_vector_fmul_v3f64(<3 x double>* %a) { +define void @constrained_vector_fmul_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_fmul_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: ld %f0, 8(%r2) @@ -691,12 +691,12 @@ entry: double 0x7FEFFFFFFFFFFFFF>, <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %mul, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_fmul_v4f64() { +define <4 x double> @constrained_vector_fmul_v4f64() #0 { ; S390X-LABEL: constrained_vector_fmul_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI14_0 @@ -733,11 +733,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %mul } -define <1 x float> @constrained_vector_fadd_v1f32() { +define <1 x float> @constrained_vector_fadd_v1f32() #0 { ; S390X-LABEL: constrained_vector_fadd_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI15_0 @@ -758,11 +758,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %add } -define <2 x double> @constrained_vector_fadd_v2f64() { +define <2 x double> @constrained_vector_fadd_v2f64() #0 { ; S390X-LABEL: constrained_vector_fadd_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI16_0 @@ -788,11 +788,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %add } -define <3 x float> @constrained_vector_fadd_v3f32() { +define <3 x float> @constrained_vector_fadd_v3f32() #0 { ; S390X-LABEL: constrained_vector_fadd_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI17_0 @@ -825,11 +825,11 @@ entry: float 0xFFFFFFFFE0000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %add } -define void @constrained_vector_fadd_v3f64(<3 x double>* %a) { +define void @constrained_vector_fadd_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_fadd_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: ld %f0, 8(%r2) @@ -864,12 +864,12 @@ entry: double 0x7FEFFFFFFFFFFFFF>, <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %add, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_fadd_v4f64() { +define <4 x double> @constrained_vector_fadd_v4f64() #0 { ; S390X-LABEL: constrained_vector_fadd_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI19_0 @@ -906,11 +906,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %add } -define <1 x float> @constrained_vector_fsub_v1f32() { +define <1 x float> @constrained_vector_fsub_v1f32() #0 { ; S390X-LABEL: constrained_vector_fsub_v1f32: ; 
S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI20_0 @@ -931,11 +931,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %sub } -define <2 x double> @constrained_vector_fsub_v2f64() { +define <2 x double> @constrained_vector_fsub_v2f64() #0 { ; S390X-LABEL: constrained_vector_fsub_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI21_0 @@ -960,11 +960,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %sub } -define <3 x float> @constrained_vector_fsub_v3f32() { +define <3 x float> @constrained_vector_fsub_v3f32() #0 { ; S390X-LABEL: constrained_vector_fsub_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI22_0 @@ -1000,11 +1000,11 @@ entry: float 0xFFFFFFFFE0000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %sub } -define void @constrained_vector_fsub_v3f64(<3 x double>* %a) { +define void @constrained_vector_fsub_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_fsub_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI23_0 @@ -1038,12 +1038,12 @@ entry: double 0xFFEFFFFFFFFFFFFF>, <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %sub, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_fsub_v4f64() { +define <4 x double> @constrained_vector_fsub_v4f64() #0 { ; S390X-LABEL: constrained_vector_fsub_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI24_0 @@ -1080,11 +1080,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %sub } -define <1 x float> @constrained_vector_sqrt_v1f32() { +define <1 x float> @constrained_vector_sqrt_v1f32() #0 { ; S390X-LABEL: constrained_vector_sqrt_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI25_0 @@ -1101,11 +1101,11 @@ entry: %sqrt = call <1 x float> @llvm.experimental.constrained.sqrt.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %sqrt } -define <2 x double> @constrained_vector_sqrt_v2f64() { +define <2 x double> @constrained_vector_sqrt_v2f64() #0 { ; S390X-LABEL: constrained_vector_sqrt_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI26_0 @@ -1125,11 +1125,11 @@ entry: %sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %sqrt } -define <3 x float> @constrained_vector_sqrt_v3f32() { +define <3 x float> @constrained_vector_sqrt_v3f32() #0 { ; S390X-LABEL: constrained_vector_sqrt_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI27_0 @@ -1156,11 +1156,11 @@ entry: %sqrt = call <3 x float> @llvm.experimental.constrained.sqrt.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %sqrt } -define void @constrained_vector_sqrt_v3f64(<3 x double>* %a) { +define void @constrained_vector_sqrt_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_sqrt_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: ld %f0, 8(%r2) @@ -1186,12 +1186,12 @@ entry: %sqrt = call <3 x double> 
@llvm.experimental.constrained.sqrt.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %sqrt, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_sqrt_v4f64() { +define <4 x double> @constrained_vector_sqrt_v4f64() #0 { ; S390X-LABEL: constrained_vector_sqrt_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI29_0 @@ -1219,11 +1219,11 @@ define <4 x double> @constrained_vector_sqrt_v4f64() { <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %sqrt } -define <1 x float> @constrained_vector_pow_v1f32() { +define <1 x float> @constrained_vector_pow_v1f32() #0 { ; S390X-LABEL: constrained_vector_pow_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1260,11 +1260,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %pow } -define <2 x double> @constrained_vector_pow_v2f64() { +define <2 x double> @constrained_vector_pow_v2f64() #0 { ; S390X-LABEL: constrained_vector_pow_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1327,11 +1327,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %pow } -define <3 x float> @constrained_vector_pow_v3f32() { +define <3 x float> @constrained_vector_pow_v3f32() #0 { ; S390X-LABEL: constrained_vector_pow_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1413,11 +1413,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %pow } -define void @constrained_vector_pow_v3f64(<3 x double>* %a) { +define void @constrained_vector_pow_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_pow_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -1508,12 +1508,12 @@ entry: <3 x double> %b, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %pow, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_pow_v4f64() { +define <4 x double> @constrained_vector_pow_v4f64() #0 { ; S390X-LABEL: constrained_vector_pow_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1613,11 +1613,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %pow } -define <1 x float> @constrained_vector_powi_v1f32() { +define <1 x float> @constrained_vector_powi_v1f32() #0 { ; S390X-LABEL: constrained_vector_powi_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1652,11 +1652,11 @@ entry: <1 x float> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %powi } -define <2 x double> @constrained_vector_powi_v2f64() { +define <2 x double> @constrained_vector_powi_v2f64() #0 { ; S390X-LABEL: constrained_vector_powi_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1709,11 +1709,11 @@ entry: <2 x double> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %powi } -define <3 x float> @constrained_vector_powi_v3f32() { +define <3 x float> 
@constrained_vector_powi_v3f32() #0 { ; S390X-LABEL: constrained_vector_powi_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1785,11 +1785,11 @@ entry: <3 x float> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %powi } -define void @constrained_vector_powi_v3f64(<3 x double>* %a) { +define void @constrained_vector_powi_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_powi_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -1865,12 +1865,12 @@ entry: <3 x double> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %powi, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_powi_v4f64() { +define <4 x double> @constrained_vector_powi_v4f64() #0 { ; S390X-LABEL: constrained_vector_powi_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1959,11 +1959,11 @@ entry: double 42.3, double 42.4>, i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %powi } -define <1 x float> @constrained_vector_sin_v1f32() { +define <1 x float> @constrained_vector_sin_v1f32() #0 { ; S390X-LABEL: constrained_vector_sin_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -1995,11 +1995,11 @@ entry: %sin = call <1 x float> @llvm.experimental.constrained.sin.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %sin } -define <2 x double> @constrained_vector_sin_v2f64() { +define <2 x double> @constrained_vector_sin_v2f64() #0 { ; S390X-LABEL: constrained_vector_sin_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2047,11 +2047,11 @@ entry: %sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %sin } -define <3 x float> @constrained_vector_sin_v3f32() { +define <3 x float> @constrained_vector_sin_v3f32() #0 { ; S390X-LABEL: constrained_vector_sin_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2116,11 +2116,11 @@ entry: %sin = call <3 x float> @llvm.experimental.constrained.sin.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %sin } -define void @constrained_vector_sin_v3f64(<3 x double>* %a) { +define void @constrained_vector_sin_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_sin_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -2194,12 +2194,12 @@ entry: %sin = call <3 x double> @llvm.experimental.constrained.sin.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %sin, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_sin_v4f64() { +define <4 x double> @constrained_vector_sin_v4f64() #0 { ; S390X-LABEL: constrained_vector_sin_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2279,11 +2279,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %sin } -define <1 x float> @constrained_vector_cos_v1f32() { +define <1 x float> @constrained_vector_cos_v1f32() #0 { ; S390X-LABEL: 
constrained_vector_cos_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2315,11 +2315,11 @@ entry: %cos = call <1 x float> @llvm.experimental.constrained.cos.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %cos } -define <2 x double> @constrained_vector_cos_v2f64() { +define <2 x double> @constrained_vector_cos_v2f64() #0 { ; S390X-LABEL: constrained_vector_cos_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2367,11 +2367,11 @@ entry: %cos = call <2 x double> @llvm.experimental.constrained.cos.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %cos } -define <3 x float> @constrained_vector_cos_v3f32() { +define <3 x float> @constrained_vector_cos_v3f32() #0 { ; S390X-LABEL: constrained_vector_cos_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2436,11 +2436,11 @@ entry: %cos = call <3 x float> @llvm.experimental.constrained.cos.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %cos } -define void @constrained_vector_cos_v3f64(<3 x double>* %a) { +define void @constrained_vector_cos_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_cos_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -2514,12 +2514,12 @@ entry: %cos = call <3 x double> @llvm.experimental.constrained.cos.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %cos, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_cos_v4f64() { +define <4 x double> @constrained_vector_cos_v4f64() #0 { ; S390X-LABEL: constrained_vector_cos_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2599,11 +2599,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %cos } -define <1 x float> @constrained_vector_exp_v1f32() { +define <1 x float> @constrained_vector_exp_v1f32() #0 { ; S390X-LABEL: constrained_vector_exp_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2635,11 +2635,11 @@ entry: %exp = call <1 x float> @llvm.experimental.constrained.exp.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %exp } -define <2 x double> @constrained_vector_exp_v2f64() { +define <2 x double> @constrained_vector_exp_v2f64() #0 { ; S390X-LABEL: constrained_vector_exp_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2687,11 +2687,11 @@ entry: %exp = call <2 x double> @llvm.experimental.constrained.exp.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %exp } -define <3 x float> @constrained_vector_exp_v3f32() { +define <3 x float> @constrained_vector_exp_v3f32() #0 { ; S390X-LABEL: constrained_vector_exp_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2756,11 +2756,11 @@ entry: %exp = call <3 x float> @llvm.experimental.constrained.exp.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %exp } -define void @constrained_vector_exp_v3f64(<3 x double>* %a) { +define void 
@constrained_vector_exp_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_exp_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -2834,12 +2834,12 @@ entry: %exp = call <3 x double> @llvm.experimental.constrained.exp.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %exp, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_exp_v4f64() { +define <4 x double> @constrained_vector_exp_v4f64() #0 { ; S390X-LABEL: constrained_vector_exp_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2919,11 +2919,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %exp } -define <1 x float> @constrained_vector_exp2_v1f32() { +define <1 x float> @constrained_vector_exp2_v1f32() #0 { ; S390X-LABEL: constrained_vector_exp2_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -2955,11 +2955,11 @@ entry: %exp2 = call <1 x float> @llvm.experimental.constrained.exp2.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %exp2 } -define <2 x double> @constrained_vector_exp2_v2f64() { +define <2 x double> @constrained_vector_exp2_v2f64() #0 { ; S390X-LABEL: constrained_vector_exp2_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3007,11 +3007,11 @@ entry: %exp2 = call <2 x double> @llvm.experimental.constrained.exp2.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %exp2 } -define <3 x float> @constrained_vector_exp2_v3f32() { +define <3 x float> @constrained_vector_exp2_v3f32() #0 { ; S390X-LABEL: constrained_vector_exp2_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3076,11 +3076,11 @@ entry: %exp2 = call <3 x float> @llvm.experimental.constrained.exp2.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %exp2 } -define void @constrained_vector_exp2_v3f64(<3 x double>* %a) { +define void @constrained_vector_exp2_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_exp2_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -3154,12 +3154,12 @@ entry: %exp2 = call <3 x double> @llvm.experimental.constrained.exp2.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %exp2, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_exp2_v4f64() { +define <4 x double> @constrained_vector_exp2_v4f64() #0 { ; S390X-LABEL: constrained_vector_exp2_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3239,11 +3239,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %exp2 } -define <1 x float> @constrained_vector_log_v1f32() { +define <1 x float> @constrained_vector_log_v1f32() #0 { ; S390X-LABEL: constrained_vector_log_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3275,11 +3275,11 @@ entry: %log = call <1 x float> @llvm.experimental.constrained.log.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %log } -define <2 x 
double> @constrained_vector_log_v2f64() { +define <2 x double> @constrained_vector_log_v2f64() #0 { ; S390X-LABEL: constrained_vector_log_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3327,11 +3327,11 @@ entry: %log = call <2 x double> @llvm.experimental.constrained.log.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %log } -define <3 x float> @constrained_vector_log_v3f32() { +define <3 x float> @constrained_vector_log_v3f32() #0 { ; S390X-LABEL: constrained_vector_log_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3396,11 +3396,11 @@ entry: %log = call <3 x float> @llvm.experimental.constrained.log.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %log } -define void @constrained_vector_log_v3f64(<3 x double>* %a) { +define void @constrained_vector_log_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_log_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -3474,12 +3474,12 @@ entry: %log = call <3 x double> @llvm.experimental.constrained.log.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %log, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_log_v4f64() { +define <4 x double> @constrained_vector_log_v4f64() #0 { ; S390X-LABEL: constrained_vector_log_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3559,11 +3559,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %log } -define <1 x float> @constrained_vector_log10_v1f32() { +define <1 x float> @constrained_vector_log10_v1f32() #0 { ; S390X-LABEL: constrained_vector_log10_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3595,11 +3595,11 @@ entry: %log10 = call <1 x float> @llvm.experimental.constrained.log10.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %log10 } -define <2 x double> @constrained_vector_log10_v2f64() { +define <2 x double> @constrained_vector_log10_v2f64() #0 { ; S390X-LABEL: constrained_vector_log10_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3647,11 +3647,11 @@ entry: %log10 = call <2 x double> @llvm.experimental.constrained.log10.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %log10 } -define <3 x float> @constrained_vector_log10_v3f32() { +define <3 x float> @constrained_vector_log10_v3f32() #0 { ; S390X-LABEL: constrained_vector_log10_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3716,11 +3716,11 @@ entry: %log10 = call <3 x float> @llvm.experimental.constrained.log10.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %log10 } -define void @constrained_vector_log10_v3f64(<3 x double>* %a) { +define void @constrained_vector_log10_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_log10_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -3794,12 +3794,12 @@ entry: %log10 = call <3 x double> @llvm.experimental.constrained.log10.v3f64( <3 x double> %b, metadata 
!"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %log10, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_log10_v4f64() { +define <4 x double> @constrained_vector_log10_v4f64() #0 { ; S390X-LABEL: constrained_vector_log10_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3879,11 +3879,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %log10 } -define <1 x float> @constrained_vector_log2_v1f32() { +define <1 x float> @constrained_vector_log2_v1f32() #0 { ; S390X-LABEL: constrained_vector_log2_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3915,11 +3915,11 @@ entry: %log2 = call <1 x float> @llvm.experimental.constrained.log2.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %log2 } -define <2 x double> @constrained_vector_log2_v2f64() { +define <2 x double> @constrained_vector_log2_v2f64() #0 { ; S390X-LABEL: constrained_vector_log2_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -3967,11 +3967,11 @@ entry: %log2 = call <2 x double> @llvm.experimental.constrained.log2.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %log2 } -define <3 x float> @constrained_vector_log2_v3f32() { +define <3 x float> @constrained_vector_log2_v3f32() #0 { ; S390X-LABEL: constrained_vector_log2_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4036,11 +4036,11 @@ entry: %log2 = call <3 x float> @llvm.experimental.constrained.log2.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %log2 } -define void @constrained_vector_log2_v3f64(<3 x double>* %a) { +define void @constrained_vector_log2_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_log2_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -4114,12 +4114,12 @@ entry: %log2 = call <3 x double> @llvm.experimental.constrained.log2.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %log2, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_log2_v4f64() { +define <4 x double> @constrained_vector_log2_v4f64() #0 { ; S390X-LABEL: constrained_vector_log2_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4199,11 +4199,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %log2 } -define <1 x float> @constrained_vector_rint_v1f32() { +define <1 x float> @constrained_vector_rint_v1f32() #0 { ; S390X-LABEL: constrained_vector_rint_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI75_0 @@ -4222,11 +4222,11 @@ entry: %rint = call <1 x float> @llvm.experimental.constrained.rint.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %rint } -define <2 x double> @constrained_vector_rint_v2f64() { +define <2 x double> @constrained_vector_rint_v2f64() #0 { ; S390X-LABEL: constrained_vector_rint_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI76_0 @@ -4247,11 +4247,11 @@ entry: %rint = call <2 x double> 
@llvm.experimental.constrained.rint.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %rint } -define <3 x float> @constrained_vector_rint_v3f32() { +define <3 x float> @constrained_vector_rint_v3f32() #0 { ; S390X-LABEL: constrained_vector_rint_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI77_0 @@ -4284,11 +4284,11 @@ define <3 x float> @constrained_vector_rint_v3f32() { %rint = call <3 x float> @llvm.experimental.constrained.rint.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %rint } -define void @constrained_vector_rint_v3f64(<3 x double>* %a) { +define void @constrained_vector_rint_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_rint_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: ld %f0, 0(%r2) @@ -4316,12 +4316,12 @@ entry: %rint = call <3 x double> @llvm.experimental.constrained.rint.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %rint, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_rint_v4f64() { +define <4 x double> @constrained_vector_rint_v4f64() #0 { ; S390X-LABEL: constrained_vector_rint_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI79_0 @@ -4352,11 +4352,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %rint } -define <1 x float> @constrained_vector_nearbyint_v1f32() { +define <1 x float> @constrained_vector_nearbyint_v1f32() #0 { ; S390X-LABEL: constrained_vector_nearbyint_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4381,11 +4381,11 @@ entry: %nearby = call <1 x float> @llvm.experimental.constrained.nearbyint.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %nearby } -define <2 x double> @constrained_vector_nearbyint_v2f64() { +define <2 x double> @constrained_vector_nearbyint_v2f64() #0 { ; S390X-LABEL: constrained_vector_nearbyint_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4419,11 +4419,11 @@ entry: %nearby = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %nearby } -define <3 x float> @constrained_vector_nearbyint_v3f32() { +define <3 x float> @constrained_vector_nearbyint_v3f32() #0 { ; S390X-LABEL: constrained_vector_nearbyint_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4475,11 +4475,11 @@ entry: %nearby = call <3 x float> @llvm.experimental.constrained.nearbyint.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %nearby } -define void @constrained_vector_nearbyint_v3f64(<3 x double>* %a) { +define void @constrained_vector_nearbyint_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_nearbyint_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -4528,12 +4528,12 @@ entry: %nearby = call <3 x double> @llvm.experimental.constrained.nearbyint.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %nearby, <3 x double>* %a ret void } -define <4 x double> 
@constrained_vector_nearbyint_v4f64() { +define <4 x double> @constrained_vector_nearbyint_v4f64() #0 { ; S390X-LABEL: constrained_vector_nearbyint_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4589,11 +4589,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %nearby } -define <1 x float> @constrained_vector_maxnum_v1f32() { +define <1 x float> @constrained_vector_maxnum_v1f32() #0 { ; S390X-LABEL: constrained_vector_maxnum_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4629,11 +4629,11 @@ entry: %max = call <1 x float> @llvm.experimental.constrained.maxnum.v1f32( <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %max } -define <2 x double> @constrained_vector_maxnum_v2f64() { +define <2 x double> @constrained_vector_maxnum_v2f64() #0 { ; S390X-LABEL: constrained_vector_maxnum_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4690,11 +4690,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %max } -define <3 x float> @constrained_vector_maxnum_v3f32() { +define <3 x float> @constrained_vector_maxnum_v3f32() #0 { ; S390X-LABEL: constrained_vector_maxnum_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4778,11 +4778,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %max } -define void @constrained_vector_log10_maxnum_v3f64(<3 x double>* %a) { +define void @constrained_vector_log10_maxnum_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_log10_maxnum_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -4869,12 +4869,12 @@ entry: <3 x double> %b, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %max, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_maxnum_v4f64() { +define <4 x double> @constrained_vector_maxnum_v4f64() #0 { ; S390X-LABEL: constrained_vector_maxnum_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -4972,11 +4972,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %max } -define <1 x float> @constrained_vector_minnum_v1f32() { +define <1 x float> @constrained_vector_minnum_v1f32() #0 { ; S390X-LABEL: constrained_vector_minnum_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5012,11 +5012,11 @@ define <1 x float> @constrained_vector_minnum_v1f32() { %min = call <1 x float> @llvm.experimental.constrained.minnum.v1f32( <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %min } -define <2 x double> @constrained_vector_minnum_v2f64() { +define <2 x double> @constrained_vector_minnum_v2f64() #0 { ; S390X-LABEL: constrained_vector_minnum_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5073,11 +5073,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %min } -define <3 x float> @constrained_vector_minnum_v3f32() { 
+define <3 x float> @constrained_vector_minnum_v3f32() #0 { ; S390X-LABEL: constrained_vector_minnum_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5161,11 +5161,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %min } -define void @constrained_vector_minnum_v3f64(<3 x double>* %a) { +define void @constrained_vector_minnum_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_minnum_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -5256,12 +5256,12 @@ entry: <3 x double> %b, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %min, <3 x double>* %a ret void } -define <4 x double> @constrained_vector_minnum_v4f64() { +define <4 x double> @constrained_vector_minnum_v4f64() #0 { ; S390X-LABEL: constrained_vector_minnum_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5359,11 +5359,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %min } -define <1 x float> @constrained_vector_fptrunc_v1f64() { +define <1 x float> @constrained_vector_fptrunc_v1f64() #0 { ; S390X-LABEL: constrained_vector_fptrunc_v1f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI95_0 @@ -5381,11 +5381,11 @@ entry: %result = call <1 x float> @llvm.experimental.constrained.fptrunc.v1f32.v1f64( <1 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %result } -define <2 x float> @constrained_vector_fptrunc_v2f64() { +define <2 x float> @constrained_vector_fptrunc_v2f64() #0 { ; S390X-LABEL: constrained_vector_fptrunc_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI96_0 @@ -5411,11 +5411,11 @@ entry: %result = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64( <2 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x float> %result } -define void @constrained_vector_fptrunc_v3f64(<3 x double>* %src, <3 x float>* %dest) { +define void @constrained_vector_fptrunc_v3f64(<3 x double>* %src, <3 x float>* %dest) #0 { ; S390X-LABEL: constrained_vector_fptrunc_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: ld %f0, 0(%r2) @@ -5451,12 +5451,12 @@ entry: %result = call <3 x float> @llvm.experimental.constrained.fptrunc.v3f32.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x float> %result, <3 x float>* %dest ret void } -define <4 x float> @constrained_vector_fptrunc_v4f64() { +define <4 x float> @constrained_vector_fptrunc_v4f64() #0 { ; S390X-LABEL: constrained_vector_fptrunc_v4f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI98_0 @@ -5496,11 +5496,11 @@ entry: <4 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %result } -define <1 x double> @constrained_vector_fpext_v1f32() { +define <1 x double> @constrained_vector_fpext_v1f32() #0 { ; S390X-LABEL: constrained_vector_fpext_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI99_0 @@ -5516,11 +5516,11 @@ define <1 x double> @constrained_vector_fpext_v1f32() { entry: %result = call <1 x double> @llvm.experimental.constrained.fpext.v1f64.v1f32( <1 x float>, - metadata !"fpexcept.strict") + metadata 
!"fpexcept.strict") #0 ret <1 x double> %result } -define <2 x double> @constrained_vector_fpext_v2f32() { +define <2 x double> @constrained_vector_fpext_v2f32() #0 { ; S390X-LABEL: constrained_vector_fpext_v2f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI100_0 @@ -5540,11 +5540,11 @@ define <2 x double> @constrained_vector_fpext_v2f32() { entry: %result = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32( <2 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %result } -define void @constrained_vector_fpext_v3f64(<3 x float>* %src, <3 x double>* %dest) { +define void @constrained_vector_fpext_v3f64(<3 x float>* %src, <3 x double>* %dest) #0 { ; S390X-LABEL: constrained_vector_fpext_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: lg %r0, 0(%r2) @@ -5576,12 +5576,12 @@ entry: %b = load <3 x float>, <3 x float>* %src %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32( <3 x float> %b, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %result, <3 x double>* %dest ret void } -define <4 x double> @constrained_vector_fpext_v4f32() { +define <4 x double> @constrained_vector_fpext_v4f32() #0 { ; S390X-LABEL: constrained_vector_fpext_v4f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: larl %r1, .LCPI102_0 @@ -5611,11 +5611,11 @@ entry: %result = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32( <4 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %result } -define <1 x float> @constrained_vector_ceil_v1f32() { +define <1 x float> @constrained_vector_ceil_v1f32() #0 { ; S390X-LABEL: constrained_vector_ceil_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5639,11 +5639,11 @@ entry: %ceil = call <1 x float> @llvm.experimental.constrained.ceil.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %ceil } -define <2 x double> @constrained_vector_ceil_v2f64() { +define <2 x double> @constrained_vector_ceil_v2f64() #0 { ; S390X-LABEL: constrained_vector_ceil_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5677,11 +5677,11 @@ entry: %ceil = call <2 x double> @llvm.experimental.constrained.ceil.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ceil } -define <3 x float> @constrained_vector_ceil_v3f32() { +define <3 x float> @constrained_vector_ceil_v3f32() #0 { ; S390X-LABEL: constrained_vector_ceil_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5732,11 +5732,11 @@ entry: %ceil = call <3 x float> @llvm.experimental.constrained.ceil.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %ceil } -define void @constrained_vector_ceil_v3f64(<3 x double>* %a) { +define void @constrained_vector_ceil_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_ceil_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -5785,12 +5785,12 @@ entry: %ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %ceil, <3 x double>* %a ret void } -define <1 x float> @constrained_vector_floor_v1f32() { +define <1 x float> @constrained_vector_floor_v1f32() #0 { ; 
S390X-LABEL: constrained_vector_floor_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5814,12 +5814,12 @@ entry: %floor = call <1 x float> @llvm.experimental.constrained.floor.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %floor } -define <2 x double> @constrained_vector_floor_v2f64() { +define <2 x double> @constrained_vector_floor_v2f64() #0 { ; S390X-LABEL: constrained_vector_floor_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5853,11 +5853,11 @@ entry: %floor = call <2 x double> @llvm.experimental.constrained.floor.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %floor } -define <3 x float> @constrained_vector_floor_v3f32() { +define <3 x float> @constrained_vector_floor_v3f32() #0 { ; S390X-LABEL: constrained_vector_floor_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5908,11 +5908,11 @@ entry: %floor = call <3 x float> @llvm.experimental.constrained.floor.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %floor } -define void @constrained_vector_floor_v3f64(<3 x double>* %a) { +define void @constrained_vector_floor_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_floor_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -5961,12 +5961,12 @@ entry: %floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %floor, <3 x double>* %a ret void } -define <1 x float> @constrained_vector_round_v1f32() { +define <1 x float> @constrained_vector_round_v1f32() #0 { ; S390X-LABEL: constrained_vector_round_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -5990,11 +5990,11 @@ entry: %round = call <1 x float> @llvm.experimental.constrained.round.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %round } -define <2 x double> @constrained_vector_round_v2f64() { +define <2 x double> @constrained_vector_round_v2f64() #0 { ; S390X-LABEL: constrained_vector_round_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -6028,11 +6028,11 @@ entry: %round = call <2 x double> @llvm.experimental.constrained.round.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %round } -define <3 x float> @constrained_vector_round_v3f32() { +define <3 x float> @constrained_vector_round_v3f32() #0 { ; S390X-LABEL: constrained_vector_round_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -6083,12 +6083,12 @@ entry: %round = call <3 x float> @llvm.experimental.constrained.round.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %round } -define void @constrained_vector_round_v3f64(<3 x double>* %a) { +define void @constrained_vector_round_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_round_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -6137,12 +6137,12 @@ entry: %round = call <3 x double> @llvm.experimental.constrained.round.v3f64( <3 x double> %b, metadata 
!"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %round, <3 x double>* %a ret void } -define <1 x float> @constrained_vector_trunc_v1f32() { +define <1 x float> @constrained_vector_trunc_v1f32() #0 { ; S390X-LABEL: constrained_vector_trunc_v1f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -6166,11 +6166,11 @@ entry: %trunc = call <1 x float> @llvm.experimental.constrained.trunc.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %trunc } -define <2 x double> @constrained_vector_trunc_v2f64() { +define <2 x double> @constrained_vector_trunc_v2f64() #0 { ; S390X-LABEL: constrained_vector_trunc_v2f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -6204,11 +6204,11 @@ entry: %trunc = call <2 x double> @llvm.experimental.constrained.trunc.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %trunc } -define <3 x float> @constrained_vector_trunc_v3f32() { +define <3 x float> @constrained_vector_trunc_v3f32() #0 { ; S390X-LABEL: constrained_vector_trunc_v3f32: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r14, %r15, 112(%r15) @@ -6259,11 +6259,11 @@ entry: %trunc = call <3 x float> @llvm.experimental.constrained.trunc.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %trunc } -define void @constrained_vector_trunc_v3f64(<3 x double>* %a) { +define void @constrained_vector_trunc_v3f64(<3 x double>* %a) #0 { ; S390X-LABEL: constrained_vector_trunc_v3f64: ; S390X: # %bb.0: # %entry ; S390X-NEXT: stmg %r13, %r15, 104(%r15) @@ -6312,11 +6312,13 @@ entry: %trunc = call <3 x double> @llvm.experimental.constrained.trunc.v3f64( <3 x double> %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 store <3 x double> %trunc, <3 x double>* %a ret void } +attributes #0 = { strictfp } + declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) diff --git a/llvm/test/CodeGen/Thumb/long.ll b/llvm/test/CodeGen/Thumb/long.ll index fbf4b08fd06af2..856196af71f55f 100644 --- a/llvm/test/CodeGen/Thumb/long.ll +++ b/llvm/test/CodeGen/Thumb/long.ll @@ -234,7 +234,7 @@ if.end: %c = add i64 %y, 47 call void @f13(i64 %c) ; CHECK: adds -; CHECK-NEXT: adcs +; CHECK: adcs ; CHECK: bl ret void } diff --git a/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll b/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll new file mode 100644 index 00000000000000..31e54c43c1e5fd --- /dev/null +++ b/llvm/test/CodeGen/Thumb/scheduler-clone-cpsr-def.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv6-linux-gnueabi < %s | FileCheck %s + +; After various DAGCombine optimizations, we end up with an sbcs with +; multiple uses of the cpsr def, and we therefore clone the subs/sbcs. +; Make sure this doesn't crash. +; +; The output here might change at some point in the future, and no +; longer clone the operations; if that happens, there probably isn't any +; straightforward way to fix the test. 
+define i64 @f(i64 %x2, i32 %z) { +; CHECK-LABEL: f: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: subs r3, r0, #1 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: sbcs r3, r2 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: adcs r3, r2 +; CHECK-NEXT: movs r4, #30 +; CHECK-NEXT: subs r5, r0, #1 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: sbcs r5, r2 +; CHECK-NEXT: adcs r4, r2 +; CHECK-NEXT: lsls r2, r1, #1 +; CHECK-NEXT: lsls r2, r4 +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: eors r4, r3 +; CHECK-NEXT: lsrs r0, r4 +; CHECK-NEXT: orrs r0, r2 +; CHECK-NEXT: lsrs r1, r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} + %x3 = add nsw i64 %x2, -1 + %x8 = icmp ne i64 %x2, 0 + %x9 = xor i1 %x8, true + %x10 = zext i1 %x9 to i64 + %x11 = lshr i64 %x2, %x10 + ret i64 %x11 +} diff --git a/llvm/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll b/llvm/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll index b28f4542cf3d22..b00554af680725 100644 --- a/llvm/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll +++ b/llvm/test/CodeGen/Thumb2/2010-03-15-AsmCCClobber.ll @@ -17,7 +17,7 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32- ; CHECK: bl _f2 ; CHECK: clz {{r[0-9]+}} ; CHECK-DAG: lsrs {{r[0-9]+}} -; CHECK-DAG: lsls {{r[0-9]+}} +; CHECK-DAG: lsl.w {{r[0-9]+}} ; CHECK-NEXT: orr.w {{r[0-9]+}} ; CHECK-NEXT: InlineAsm Start define void @test(%s1* %this, i32 %format, i32 %w, i32 %h, i32 %levels, i32* %s, i8* %data, i32* nocapture %rowbytes, void (i8*, i8*)* %release, i8* %info) nounwind { diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll index d8b94a4a850e56..74596ff399b1a1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll @@ -4,15 +4,7 @@ define arm_aapcs_vfpcc <4 x i32> @sext_0246(<8 x i16> %src) { ; CHECK-LABEL: sext_0246: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vmovlb.s16 q0, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <8 x i16> %src, <8 x i16> undef, <4 x i32> @@ -35,15 +27,7 @@ entry: define arm_aapcs_vfpcc <4 x i32> @zext_0246(<8 x i16> %src) { ; CHECK-LABEL: zext_0246: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vmovlb.u16 q0, q1 +; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <8 x i16> %src, <8 x i16> undef, <4 x i32> @@ -66,23 +50,7 @@ entry: define arm_aapcs_vfpcc <8 x i16> @sext_02468101214(<16 x i8> %src) { ; CHECK-LABEL: sext_02468101214: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; 
CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmovlb.s8 q0, q1 +; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <16 x i8> %src, <16 x i8> undef, <8 x i32> @@ -105,23 +73,7 @@ entry: define arm_aapcs_vfpcc <8 x i16> @zext_02468101214(<16 x i8> %src) { ; CHECK-LABEL: zext_02468101214: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmovlb.u8 q0, q1 +; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <16 x i8> %src, <16 x i8> undef, <8 x i32> diff --git a/llvm/test/CodeGen/WebAssembly/cpus.ll b/llvm/test/CodeGen/WebAssembly/cpus.ll index 8ede6cbb5a716c..01964e9c85abc0 100644 --- a/llvm/test/CodeGen/WebAssembly/cpus.ll +++ b/llvm/test/CodeGen/WebAssembly/cpus.ll @@ -1,16 +1,17 @@ ; This tests that llc accepts all valid WebAssembly CPUs. ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s -; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=mvp 2>&1 | FileCheck %s +; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=mvp 2>&1 | FileCheck %s --check-prefix=WASM64 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s -; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=generic 2>&1 | FileCheck %s +; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=generic 2>&1 | FileCheck %s --check-prefix=WASM64 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s -; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=bleeding-edge 2>&1 | FileCheck %s +; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=bleeding-edge 2>&1 | FileCheck %s --check-prefix=WASM64 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID -; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID +; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=WASM64 ; CHECK-NOT: is not a recognized processor for this target ; INVALID: {{.+}} is not a recognized processor for this target +; WASM64: 64-bit WebAssembly (wasm64) is not currently supported define i32 @f(i32 %i_like_the_web) { ret i32 %i_like_the_web diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-memop-scalar-unordered.mir b/llvm/test/CodeGen/X86/GlobalISel/select-memop-scalar-unordered.mir index d9016f907d17ab..e963104ca5c049 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-memop-scalar-unordered.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-memop-scalar-unordered.mir @@ -6,17 +6,17 @@ --- | define i8 @test_load_i8(i8* %p1) { - %r = load atomic i8, i8* %p1 unordered, align 8 + %r = load atomic i8, 
i8* %p1 unordered, align 1 ret i8 %r } define i16 @test_load_i16(i16* %p1) { - %r = load atomic i16, i16* %p1 unordered, align 8 + %r = load atomic i16, i16* %p1 unordered, align 2 ret i16 %r } define i32 @test_load_i32(i32* %p1) { - %r = load atomic i32, i32* %p1 unordered, align 8 + %r = load atomic i32, i32* %p1 unordered, align 4 ret i32 %r } @@ -26,7 +26,7 @@ } define float @test_load_float(float* %p1) { - %r = load atomic float, float* %p1 unordered, align 8 + %r = load atomic float, float* %p1 unordered, align 4 ret float %r } @@ -46,7 +46,7 @@ } define i32* @test_store_i32(i32 %val, i32* %p1) { - store atomic i32 %val, i32* %p1 unordered, align 8 + store atomic i32 %val, i32* %p1 unordered, align 4 ret i32* %p1 } @@ -56,12 +56,12 @@ } define float* @test_store_float(float %val, float* %p1) { - store atomic float %val, float* %p1 unordered, align 8 + store atomic float %val, float* %p1 unordered, align 4 ret float* %p1 } define float* @test_store_float_vec(float %val, float* %p1) { - store atomic float %val, float* %p1 unordered, align 8 + store atomic float %val, float* %p1 unordered, align 4 ret float* %p1 } diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index db4fc2d05751c1..35055a5adca8f8 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -2315,22 +2315,11 @@ define i64 @constant_folding(i64* %p) { ; Legal to forward and fold (TODO) define i64 @load_forwarding(i64* %p) { -; CHECK-O0-LABEL: load_forwarding: -; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: movq (%rdi), %rax -; CHECK-O0-NEXT: orq (%rdi), %rax -; CHECK-O0-NEXT: retq -; -; CHECK-O3-CUR-LABEL: load_forwarding: -; CHECK-O3-CUR: # %bb.0: -; CHECK-O3-CUR-NEXT: movq (%rdi), %rax -; CHECK-O3-CUR-NEXT: orq (%rdi), %rax -; CHECK-O3-CUR-NEXT: retq -; -; CHECK-O3-EX-LABEL: load_forwarding: -; CHECK-O3-EX: # %bb.0: -; CHECK-O3-EX-NEXT: movq (%rdi), %rax -; CHECK-O3-EX-NEXT: retq +; CHECK-LABEL: load_forwarding: +; CHECK: # %bb.0: +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: orq (%rdi), %rax +; CHECK-NEXT: retq %v = load atomic i64, i64* %p unordered, align 8 %v2 = load atomic i64, i64* %p unordered, align 8 %ret = or i64 %v, %v2 @@ -2459,8 +2448,8 @@ define i64 @fold_constant_clobber(i64* %p, i64 %arg) { ; CHECK-O3-EX-LABEL: fold_constant_clobber: ; CHECK-O3-EX: # %bb.0: ; CHECK-O3-EX-NEXT: movq %rsi, %rax -; CHECK-O3-EX-NEXT: movq $5, (%rdi) ; CHECK-O3-EX-NEXT: addq {{.*}}(%rip), %rax +; CHECK-O3-EX-NEXT: movq $5, (%rdi) ; CHECK-O3-EX-NEXT: retq %v = load atomic i64, i64* @Constant unordered, align 8 store i64 5, i64* %p @@ -2486,8 +2475,8 @@ define i64 @fold_constant_fence(i64 %arg) { ; CHECK-O3-EX-LABEL: fold_constant_fence: ; CHECK-O3-EX: # %bb.0: ; CHECK-O3-EX-NEXT: movq %rdi, %rax -; CHECK-O3-EX-NEXT: mfence ; CHECK-O3-EX-NEXT: addq {{.*}}(%rip), %rax +; CHECK-O3-EX-NEXT: mfence ; CHECK-O3-EX-NEXT: retq %v = load atomic i64, i64* @Constant unordered, align 8 fence seq_cst @@ -2513,8 +2502,8 @@ define i64 @fold_invariant_clobber(i64* dereferenceable(8) %p, i64 %arg) { ; CHECK-O3-EX-LABEL: fold_invariant_clobber: ; CHECK-O3-EX: # %bb.0: ; CHECK-O3-EX-NEXT: movq %rsi, %rax -; CHECK-O3-EX-NEXT: movq $5, (%rdi) ; CHECK-O3-EX-NEXT: addq (%rdi), %rax +; CHECK-O3-EX-NEXT: movq $5, (%rdi) ; CHECK-O3-EX-NEXT: retq %v = load atomic i64, i64* %p unordered, align 8, !invariant.load !{} store i64 5, i64* %p @@ -2541,8 +2530,8 @@ define i64 @fold_invariant_fence(i64* dereferenceable(8) %p, i64 %arg) { ; CHECK-O3-EX-LABEL: 
fold_invariant_fence: ; CHECK-O3-EX: # %bb.0: ; CHECK-O3-EX-NEXT: movq %rsi, %rax -; CHECK-O3-EX-NEXT: mfence ; CHECK-O3-EX-NEXT: addq (%rdi), %rax +; CHECK-O3-EX-NEXT: mfence ; CHECK-O3-EX-NEXT: retq %v = load atomic i64, i64* %p unordered, align 8, !invariant.load !{} fence seq_cst @@ -2680,3 +2669,85 @@ define i64 @load_i16_anyext_i64(i16* %ptr) { %res = bitcast <4 x i16> %vec to i64 ret i64 %res } + +; TODO: Would be legal to combine for legal atomic wider types +define i16 @load_combine(i8* %p) { +; CHECK-O0-LABEL: load_combine: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movb (%rdi), %al +; CHECK-O0-NEXT: movb 1(%rdi), %cl +; CHECK-O0-NEXT: movzbl %al, %edx +; CHECK-O0-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-O0-NEXT: movzbl %cl, %esi +; CHECK-O0-NEXT: # kill: def $si killed $si killed $esi +; CHECK-O0-NEXT: shlw $8, %si +; CHECK-O0-NEXT: orw %si, %dx +; CHECK-O0-NEXT: movw %dx, %ax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: load_combine: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movzbl (%rdi), %ecx +; CHECK-O3-NEXT: movzbl 1(%rdi), %eax +; CHECK-O3-NEXT: shll $8, %eax +; CHECK-O3-NEXT: orl %ecx, %eax +; CHECK-O3-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-O3-NEXT: retq + %v1 = load atomic i8, i8* %p unordered, align 2 + %p2 = getelementptr i8, i8* %p, i64 1 + %v2 = load atomic i8, i8* %p2 unordered, align 1 + %v1.ext = zext i8 %v1 to i16 + %v2.ext = zext i8 %v2 to i16 + %v2.sht = shl i16 %v2.ext, 8 + %res = or i16 %v1.ext, %v2.sht + ret i16 %res +} + +define i1 @fold_cmp_over_fence(i32* %p, i32 %v1) { +; CHECK-O0-LABEL: fold_cmp_over_fence: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movl (%rdi), %eax +; CHECK-O0-NEXT: mfence +; CHECK-O0-NEXT: cmpl %eax, %esi +; CHECK-O0-NEXT: jne .LBB116_2 +; CHECK-O0-NEXT: # %bb.1: # %taken +; CHECK-O0-NEXT: movb $1, %al +; CHECK-O0-NEXT: retq +; CHECK-O0-NEXT: .LBB116_2: # %untaken +; CHECK-O0-NEXT: xorl %eax, %eax +; CHECK-O0-NEXT: # kill: def $al killed $al killed $eax +; CHECK-O0-NEXT: retq +; +; CHECK-O3-CUR-LABEL: fold_cmp_over_fence: +; CHECK-O3-CUR: # %bb.0: +; CHECK-O3-CUR-NEXT: movl (%rdi), %eax +; CHECK-O3-CUR-NEXT: mfence +; CHECK-O3-CUR-NEXT: cmpl %eax, %esi +; CHECK-O3-CUR-NEXT: jne .LBB116_2 +; CHECK-O3-CUR-NEXT: # %bb.1: # %taken +; CHECK-O3-CUR-NEXT: movb $1, %al +; CHECK-O3-CUR-NEXT: retq +; CHECK-O3-CUR-NEXT: .LBB116_2: # %untaken +; CHECK-O3-CUR-NEXT: xorl %eax, %eax +; CHECK-O3-CUR-NEXT: retq +; +; CHECK-O3-EX-LABEL: fold_cmp_over_fence: +; CHECK-O3-EX: # %bb.0: +; CHECK-O3-EX-NEXT: cmpl (%rdi), %esi +; CHECK-O3-EX-NEXT: mfence +; CHECK-O3-EX-NEXT: jne .LBB116_2 +; CHECK-O3-EX-NEXT: # %bb.1: # %taken +; CHECK-O3-EX-NEXT: movb $1, %al +; CHECK-O3-EX-NEXT: retq +; CHECK-O3-EX-NEXT: .LBB116_2: # %untaken +; CHECK-O3-EX-NEXT: xorl %eax, %eax +; CHECK-O3-EX-NEXT: retq + %v2 = load atomic i32, i32* %p unordered, align 4 + fence seq_cst + %cmp = icmp eq i32 %v1, %v2 + br i1 %cmp, label %taken, label %untaken +taken: + ret i1 true +untaken: + ret i1 false +} diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index d3a261e9eb7fb3..609c02eee09022 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -159,18 +159,14 @@ define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone s ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vmovsd %xmm0, (%eax) -; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vbroadcastsd (%ecx), %ymm0 +; X32-NEXT: vmovlps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: C2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vmovsd %xmm0, (%rsi) -; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-NEXT: vmovlps %xmm0, (%rsi) ; X64-NEXT: retq entry: %q = load double, double* %ptr, align 8 @@ -231,18 +227,14 @@ define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: vbroadcastss (%ecx), %ymm0 ; X32-NEXT: vmovss %xmm0, (%eax) -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: D3: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vbroadcastss (%rdi), %ymm0 ; X64-NEXT: vmovss %xmm0, (%rsi) -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq entry: %q = load float, float* %ptr, align 4 @@ -285,16 +277,14 @@ define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: vbroadcastss (%ecx), %xmm0 ; X32-NEXT: vmovss %xmm0, (%eax) -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: e2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vbroadcastss (%rdi), %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: retq entry: %q = load float, float* %ptr, align 4 @@ -669,16 +659,14 @@ define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone s ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vmovsd %xmm0, (%eax) -; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vmovlps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: I2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vmovsd %xmm0, (%rsi) -; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: vmovlps %xmm0, (%rsi) ; X64-NEXT: retq entry: %q = load double, double* %ptr, align 4 @@ -884,7 +872,6 @@ define void @broadcast_v16i32(i32* %a, <16 x i32>* %b) { ; ; Broadcast scale factor for xyz vector - slp will have vectorized xy. -; FIXME: Load as a broadcast and then use the scalar 0'th element. 
; define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture readonly) nounwind { ; X32-LABEL: broadcast_scale_xyz: @@ -892,9 +879,8 @@ define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture ; X32-NEXT: subl $12, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; X32-NEXT: vmulpd (%eax), %xmm1, %xmm1 +; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vmulpd (%eax), %xmm0, %xmm1 ; X32-NEXT: vmulsd 16(%eax), %xmm0, %xmm0 ; X32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; X32-NEXT: vaddsd %xmm2, %xmm1, %xmm1 @@ -906,9 +892,8 @@ define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture ; ; X64-LABEL: broadcast_scale_xyz: ; X64: ## %bb.0: -; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; X64-NEXT: vmulpd (%rsi), %xmm1, %xmm1 +; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: vmulpd (%rsi), %xmm0, %xmm1 ; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index 29793a7e0bc62d..be88e3530a3c78 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -190,6 +190,52 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { ret <4 x i64>%z } +define <4 x i64> @imulq256_bcast(<4 x i64> %x) { +; AVX512F-LABEL: imulq256_bcast: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337] +; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: imulq256_bcast: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337] +; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: imulq256_bcast: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337] +; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: imulq256_bcast: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: imulq256_bcast: +; SKX: # %bb.0: +; SKX-NEXT: vpmullq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; SKX-NEXT: retq + %z = mul <4 x i64> %x, + ret <4 x i64>%z +} + define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { ; AVX512F-LABEL: imulq128: ; AVX512F: # %bb.0: @@ -244,6 +290,54 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { ret <2 x i64>%z } +define <2 x i64> @imulq128_bcast(<2 x i64> %x) { +; AVX512F-LABEL: imulq128_bcast: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: 
vpsrlq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: imulq128_bcast: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: imulq128_bcast: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: imulq128_bcast: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: imulq128_bcast: +; SKX: # %bb.0: +; SKX-NEXT: vpmullq {{.*}}(%rip), %xmm0, %xmm0 +; SKX-NEXT: retq + %z = mul <2 x i64> %x, + ret <2 x i64>%z +} + define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) { ; CHECK-LABEL: mulpd512: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 5fb114b3523ae6..b13c27e0d470b1 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -531,211 +531,256 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: pushq %r12 ; KNL-NEXT: pushq %rbx ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, 
%k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 -; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kmovw %esi, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, 
%k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k6 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k5 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k4 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, 
%k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw %esi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; 
KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: korw %k2, %k0, %k0 -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: korw %k2, %k5, %k2 ; KNL-NEXT: xorl %ecx, %ecx ; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp) ; KNL-NEXT: movl $65535, %edx ## imm = 0xFFFF ; KNL-NEXT: movl $0, %esi ; KNL-NEXT: cmovnel %edx, %esi -; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: korw %k2, %k4, %k2 ; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp) +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: korw %k2, %k3, %k2 ; KNL-NEXT: cmovnel %edx, %ecx +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kandw %k1, %k2, %k1 ; KNL-NEXT: kmovw %k1, %r8d @@ -832,193 +877,294 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: pushq %r13 ; SKX-NEXT: pushq %r12 ; SKX-NEXT: pushq %rbx -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: movq 
%rdi, %rax -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k0, %k0 -; SKX-NEXT: kshiftrd $30, %k0, %k0 -; SKX-NEXT: kxord %k0, %k2, %k2 -; SKX-NEXT: kshiftrd $2, %k2, %k3 -; SKX-NEXT: kxord %k1, %k3, %k1 -; SKX-NEXT: kshiftld $31, %k1, %k1 -; SKX-NEXT: kshiftrd $29, %k1, %k1 -; SKX-NEXT: kxord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $31, %k0, %k1 +; SKX-NEXT: kshiftld $2, %k0, %k0 +; SKX-NEXT: kord %k0, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 ; SKX-NEXT: kshiftrd $3, %k1, %k2 +; SKX-NEXT: kshiftld $3, %k2, %k2 +; SKX-NEXT: kshiftld $30, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $29, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $4, %k1, %k2 +; SKX-NEXT: kshiftld $4, %k2, %k2 +; SKX-NEXT: kshiftld $29, %k1, %k1 +; SKX-NEXT: kshiftrd $29, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $28, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $5, %k1, %k2 +; SKX-NEXT: kshiftld $5, %k2, %k2 +; SKX-NEXT: kshiftld $28, %k1, %k1 +; SKX-NEXT: kshiftrd $28, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $4, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $27, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $5, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $6, %k1, %k2 +; SKX-NEXT: kshiftld $6, %k2, %k2 +; SKX-NEXT: kshiftld $27, %k1, %k1 +; SKX-NEXT: kshiftrd $27, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $26, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $6, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $7, %k1, %k2 +; SKX-NEXT: kshiftld $7, %k2, %k2 +; SKX-NEXT: kshiftld $26, %k1, %k1 +; SKX-NEXT: kshiftrd $26, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $25, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kshiftld $8, %k2, %k2 +; SKX-NEXT: kshiftld $25, %k1, %k1 +; SKX-NEXT: kshiftrd $25, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $7, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $24, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kshiftld $9, %k2, %k2 +; SKX-NEXT: kshiftld $24, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $24, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $23, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 
+; SKX-NEXT: kshiftrd $10, %k1, %k2 +; SKX-NEXT: kshiftld $10, %k2, %k2 +; SKX-NEXT: kshiftld $23, %k1, %k1 +; SKX-NEXT: kshiftrd $23, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $22, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $11, %k1, %k2 +; SKX-NEXT: kshiftld $11, %k2, %k2 +; SKX-NEXT: kshiftld $22, %k1, %k1 +; SKX-NEXT: kshiftrd $22, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $10, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $21, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $11, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $12, %k1, %k2 +; SKX-NEXT: kshiftld $12, %k2, %k2 +; SKX-NEXT: kshiftld $21, %k1, %k1 +; SKX-NEXT: kshiftrd $21, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $20, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $12, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $13, %k1, %k2 +; SKX-NEXT: kshiftld $13, %k2, %k2 +; SKX-NEXT: kshiftld $20, %k1, %k1 +; SKX-NEXT: kshiftrd $20, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $19, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kshiftld $14, %k2, %k2 +; SKX-NEXT: kshiftld $19, %k1, %k1 +; SKX-NEXT: kshiftrd $19, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $13, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $18, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kshiftld $15, %k2, %k2 +; SKX-NEXT: kshiftld $18, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $18, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $17, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k2 +; SKX-NEXT: kshiftld $16, %k2, %k2 +; SKX-NEXT: kshiftld $17, %k1, %k1 +; SKX-NEXT: kshiftrd $17, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $16, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $17, %k1, %k2 +; SKX-NEXT: kshiftld $17, %k2, %k2 +; SKX-NEXT: kshiftld $16, %k1, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $16, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $15, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kmovd %esi, %k2 +; 
SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $31, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kmovd %edx, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $3, %k0, %k2 +; SKX-NEXT: kshiftld $3, %k2, %k2 +; SKX-NEXT: kshiftld $30, %k0, %k0 +; SKX-NEXT: kshiftrd $30, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovd %ecx, %k2 -; SKX-NEXT: kmovd %esi, %k3 -; SKX-NEXT: kxord %k0, %k3, %k0 -; SKX-NEXT: kshiftrd $2, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $29, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $3, %k0, %k2 -; SKX-NEXT: kmovd %r8d, %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $4, %k0, %k2 +; SKX-NEXT: kshiftld $4, %k2, %k2 +; SKX-NEXT: kshiftld $29, %k0, %k0 +; SKX-NEXT: kshiftrd $29, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovd %r8d, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $28, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $4, %k0, %k2 -; SKX-NEXT: kmovd %r9d, %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $5, %k0, %k2 +; SKX-NEXT: kshiftld $5, %k2, %k2 +; SKX-NEXT: kshiftld $28, %k0, %k0 +; SKX-NEXT: kshiftrd $28, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovd %r9d, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $27, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $6, %k0, %k2 +; SKX-NEXT: kshiftld $6, %k2, %k2 +; SKX-NEXT: kshiftld $27, %k0, %k0 +; SKX-NEXT: kshiftrd $27, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $5, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $26, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $6, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $25, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $7, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $7, %k2, %k2 +; SKX-NEXT: kshiftld $26, %k0, %k0 +; SKX-NEXT: kshiftrd $26, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $24, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $25, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $8, %k0, %k2 +; SKX-NEXT: kshiftld $8, %k2, %k2 +; SKX-NEXT: kshiftld $25, %k0, %k0 +; SKX-NEXT: kshiftrd $25, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $8, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $23, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $24, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $9, %k0, %k2 +; SKX-NEXT: kshiftld $9, %k2, %k2 +; SKX-NEXT: kshiftld $24, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $22, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $24, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $23, %k2, %k2 +; SKX-NEXT: 
kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $10, %k0, %k2 +; SKX-NEXT: kshiftld $10, %k2, %k2 +; SKX-NEXT: kshiftld $23, %k0, %k0 +; SKX-NEXT: kshiftrd $23, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $22, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $11, %k0, %k2 +; SKX-NEXT: kshiftld $11, %k2, %k2 +; SKX-NEXT: kshiftld $22, %k0, %k0 +; SKX-NEXT: kshiftrd $22, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $21, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $12, %k0, %k2 +; SKX-NEXT: kshiftld $12, %k2, %k2 +; SKX-NEXT: kshiftld $21, %k0, %k0 +; SKX-NEXT: kshiftrd $21, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $11, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $20, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $12, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $19, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $13, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $13, %k2, %k2 +; SKX-NEXT: kshiftld $20, %k0, %k0 +; SKX-NEXT: kshiftrd $20, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $18, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $19, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $14, %k0, %k2 +; SKX-NEXT: kshiftld $14, %k2, %k2 +; SKX-NEXT: kshiftld $19, %k0, %k0 +; SKX-NEXT: kshiftrd $19, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $14, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $17, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $18, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $15, %k0, %k2 +; SKX-NEXT: kshiftld $15, %k2, %k2 +; SKX-NEXT: kshiftld $18, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $16, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $18, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $17, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k2 +; SKX-NEXT: kshiftld $16, %k2, %k2 +; SKX-NEXT: kshiftld $17, %k0, %k0 +; SKX-NEXT: kshiftrd $17, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $16, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $17, %k0, %k2 +; SKX-NEXT: kshiftld $17, %k2, %k2 +; SKX-NEXT: kshiftld $16, %k0, %k0 +; SKX-NEXT: kshiftrd $16, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $15, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r8d @@ -1113,215 
+1259,262 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: pushl %ebx ; KNL_X32-NEXT: pushl %edi ; KNL_X32-NEXT: pushl %esi +; KNL_X32-NEXT: subl $20, %esp ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 ; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $3, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $4, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $5, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $6, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $7, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $8, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; 
KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $9, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $10, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k6 +; KNL_X32-NEXT: korw %k1, %k6, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $11, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k5 +; KNL_X32-NEXT: korw %k1, %k5, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k4 +; KNL_X32-NEXT: korw %k1, %k4, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $13, %k1, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $8, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload +; KNL_X32-NEXT: korw 
%k0, %k2, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $9, %k0, %k2 +; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $10, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $11, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $12, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $2, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 ; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k1, %k3, %k1 -; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; 
KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $9, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $10, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $8, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $11, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $9, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: 
kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $12, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k6, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $13, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k5, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k4, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $1, %k1, %k1 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k3, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 ; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: xorl %eax, %eax ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl $65535, %ecx ## imm = 0xFFFF ; KNL_X32-NEXT: movl $0, %edx ; KNL_X32-NEXT: cmovnel %ecx, %edx +; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %bl +; KNL_X32-NEXT: kmovw %ebx, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %edx, %k1 ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: cmovnel %ecx, %eax -; KNL_X32-NEXT: kandw %k0, %k1, %k0 -; KNL_X32-NEXT: kmovw %edx, %k1 +; KNL_X32-NEXT: kmovw (%esp), %k2 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kandw %k1, %k2, %k1 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1403,6 +1596,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: orl %esi, %ecx ; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: movw %cx, (%eax) +; KNL_X32-NEXT: addl $20, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: popl %edi ; KNL_X32-NEXT: popl %ebx @@ -1416,356 +1610,550 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL-LABEL: test17: ; KNL: ## %bb.0: ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; 
KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; 
KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $3, %k3, %k3 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; 
KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $4, %k3, %k3 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $5, %k3, %k3 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $6, %k3, %k3 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k3 +; KNL-NEXT: kshiftlw $7, %k3, %k3 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k1, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: korw %k0, %k3, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $14, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $3, %k4, %k4 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $4, %k4, %k4 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; 
KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $5, %k4, %k4 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: kshiftlw $7, %k4, %k4 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k1, %k5, %k5 -; KNL-NEXT: kshiftrw $2, %k5, %k6 -; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: korw %k0, %k4, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $14, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $3, %k5, %k5 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $12, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $4, %k5, %k5 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $11, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $5, %k5, %k5 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; 
KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $10, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $6, %k5, %k5 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k5 +; KNL-NEXT: kshiftlw $7, %k5, %k5 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 ; KNL-NEXT: kshiftrw $9, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k1, %k6, %k6 -; KNL-NEXT: kshiftrw $2, %k6, %k7 -; KNL-NEXT: kxorw %k0, %k7, %k0 +; KNL-NEXT: korw %k0, %k5, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $14, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $3, %k6, %k6 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $4, %k6, %k6 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $5, %k6, %k6 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $6, %k6, %k6 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; 
KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $10, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k6 +; KNL-NEXT: kshiftlw $7, %k6, %k6 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: korw %k0, %k6, %k6 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil ; KNL-NEXT: kmovw %edi, %k0 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kshiftrw $2, %k0, %k2 -; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $14, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $13, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $12, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $11, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $10, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $9, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k7 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %edx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; 
KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k7 -; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftrw $2, %k1, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 ; 
KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $6, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kandw %k7, %k0, %k0 ; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: kandw %k4, %k0, %k0 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kandw %k3, %k0, %k0 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload @@ -1808,300 +2196,488 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-LABEL: test17: ; SKX: ## %bb.0: ; SKX-NEXT: movq %rdi, %rax -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kshiftlb $7, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k0 -; SKX-NEXT: kxorb %k0, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k3 -; SKX-NEXT: kxorb %k1, %k3, %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kxorb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $7, %k0, %k1 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: korb %k0, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kshiftlb $6, %k1, %k1 +; SKX-NEXT: kshiftrb $6, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: 
kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $4, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k2 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftlb $5, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $3, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k3, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $6, %k1, %k2 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftlb $3, %k1, %k1 +; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k3, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $7, %k1, %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftlb $2, %k1, %k1 +; SKX-NEXT: kshiftrb $2, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $1, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kxorb %k0, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k2 +; SKX-NEXT: korb %k0, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k3 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $4, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $3, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $5, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $2, %k3, %k3 -; 
SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $6, %k2, %k3 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftlb $2, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 ; SKX-NEXT: kshiftrb $1, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kandb %k1, %k2, %k1 -; SKX-NEXT: kxorb %k0, %k4, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftrb $7, %k2, %k2 +; SKX-NEXT: korb %k0, %k2, %k2 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k3 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k3 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $4, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k3 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kshiftrb $4, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $3, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftrb $5, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k3 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $2, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftrb $6, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftlb $2, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $1, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k3, %k5, %k3 
; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k3 +; SKX-NEXT: korb %k0, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $3, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 -; SKX-NEXT: kshiftrb $4, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k4 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 ; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k4 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $2, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftlb $2, %k3, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $6, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftrb $1, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k4, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 -; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; 
SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kshiftrb $5, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kandb %k3, %k4, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kandb %k2, %k3, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k3, %k5, %k3 -; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: korb %k0, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k4 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftlb $5, %k3, %k3 ; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $3, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $4, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k4 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $2, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $6, %k3, %k5 
-; SKX-NEXT: kxorb %k4, %k5, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftlb $2, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftrb $1, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $4, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kshiftrb $5, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kandb %k3, %k4, %k3 -; SKX-NEXT: kxorb %k0, %k7, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 -; SKX-NEXT: kmovb 
{{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 -; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $3, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kmovd %esi, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $7, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kmovd %edx, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 ; SKX-NEXT: kmovd %ecx, %k5 -; SKX-NEXT: kmovd %esi, %k6 -; SKX-NEXT: kxorb %k0, %k6, %k0 -; SKX-NEXT: kshiftrb $2, %k0, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $3, %k0, %k5 -; SKX-NEXT: kmovd %r8d, %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k0, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovd %r8d, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $4, %k0, %k5 -; SKX-NEXT: kmovd %r9d, %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k0, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovd %r9d, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, 
%k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $5, %k0, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k0, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: korb %k0, %k5, %k0 ; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kandb %k2, %k0, %k0 @@ -2144,362 +2720,557 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-LABEL: test17: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: pushl %ebx -; KNL_X32-NEXT: subl $8, %esp +; KNL_X32-NEXT: pushl %eax ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw 
%k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; 
KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kmovw %k0, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k1, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $2, %k3, %k4 -; KNL_X32-NEXT: kxorw %k0, %k4, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $3, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $4, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $5, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $6, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $7, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; 
KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 ; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k3, %k3 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k1, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $2, %k4, %k5 -; KNL_X32-NEXT: kxorw %k0, %k5, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $14, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $3, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $4, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $5, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $6, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $7, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 ; KNL_X32-NEXT: kshiftrw $9, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k4 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw 
%k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k1, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $2, %k5, %k6 -; KNL_X32-NEXT: kxorw %k0, %k6, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $14, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $3, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $4, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $11, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $5, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $6, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $7, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 ; KNL_X32-NEXT: kshiftrw $9, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k5 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k1, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $2, %k6, %k7 -; KNL_X32-NEXT: kxorw %k0, %k7, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $14, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k6 
+; KNL_X32-NEXT: kshiftlw $3, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $4, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $5, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $6, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $7, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 ; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kxorw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $14, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $3, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; 
KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $13, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $4, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $12, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $5, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $11, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $6, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $10, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $7, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $9, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k7 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k7 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kxorw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k0, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: 
kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k1, %k2, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kxorw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $13, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $12, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; 
KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $11, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $10, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: kandw %k7, %k0, %k0 ; KNL_X32-NEXT: kandw %k6, %k0, %k0 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 -; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload -; KNL_X32-NEXT: kandw %k1, %k0, %k0 -; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw (%esp), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 @@ -2537,7 +3308,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andb $127, %cl ; KNL_X32-NEXT: movb %cl, (%eax) -; KNL_X32-NEXT: addl $8, %esp +; KNL_X32-NEXT: addl $4, %esp ; KNL_X32-NEXT: popl %ebx ; KNL_X32-NEXT: retl $4 %j = and <7 x i1> %a, %b diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index 20af8194825638..fcb07a504067b9 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -1886,410 +1886,495 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: # %bb.0: -; KNL-NEXT: kmovw %edx, %k1 -; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: kmovw %r8d, 
%k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kshiftlw $3, %k0, %k3 +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k4 +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k5 +; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k6 +; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k7 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: 
kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, 
%k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k2, %k4, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: 
kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; 
KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k0, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k3, %k5, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k4, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb 
{{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $15, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k0, %k5, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k5 -; 
KNL-NEXT: kxorw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $2, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: korw %k7, %k6, %k7 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: korw %k7, %k2, %k7 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $8, %k7, %k7 +; KNL-NEXT: korw %k7, %k3, %k7 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: 
kshiftlw $9, %k7, %k7 +; KNL-NEXT: korw %k7, %k4, %k7 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k5, %k6 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k6 +; KNL-NEXT: kshiftlw $11, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k6, %k2, %k5 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k4 +; KNL-NEXT: kshiftlw $12, %k5, %k5 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k5, %k2, %k4 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $13, %k4, %k4 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k4, %k2, %k3 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $14, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k0, %k4 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k4} {z} +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k2 +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm4, %ymm4 ; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k3} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm5, %ymm4 ; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 -; 
KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm6, %ymm4 ; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm7, %ymm4 ; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0 ; KNL-NEXT: retq ; @@ -2304,410 +2389,495 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; ; AVX512DQNOBW-LABEL: test21: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: kmovw %edx, %k0 -; AVX512DQNOBW-NEXT: kmovw %edi, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQNOBW-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQNOBW-NEXT: kmovw %edi, %k0 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kmovw %esi, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k3 +; AVX512DQNOBW-NEXT: kmovw %edx, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k3, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512DQNOBW-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %ecx, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %r8d, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %r9d, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k4 +; AVX512DQNOBW-NEXT: kmovw %ecx, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k4, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k5 +; AVX512DQNOBW-NEXT: kmovw %r8d, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k5, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k6 +; AVX512DQNOBW-NEXT: kmovw %r9d, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k6, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k7 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, 
%k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k7, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k0, %k1, %k0 +; AVX512DQNOBW-NEXT: korw %k0, %k2, %k0 ; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k1, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQNOBW-NEXT: kxorw %k2, %k4, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw 
%k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k6, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw 
$8, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; 
AVX512DQNOBW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k1, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k5 -; AVX512DQNOBW-NEXT: kxorw %k3, %k5, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw 
%eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k6, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, 
%k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k1, %k5, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k5 -; AVX512DQNOBW-NEXT: kxorw %k4, %k5, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: 
kshiftrw $4, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k6, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: 
kshiftrw $10, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k1, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k3, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k4, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k5, %k6 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k6, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k6 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k6, %k1, %k5 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k5, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k5, %k5 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k5, %k1, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k4, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k4, %k4 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k4, %k1, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; 
AVX512DQNOBW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k3, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 ; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQNOBW-NEXT: vpmovm2d %k3, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm5, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX512DQNOBW-NEXT: vpmovm2d %k2, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm6, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm4, %ymm3 -; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm7, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm4, %ymm0 ; AVX512DQNOBW-NEXT: retq %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index d37220222ce772..6e36bd1bb0eb15 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -302,12 +302,15 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movb (%rdi), %al ; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $10, %k0, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $ax killed $ax killed $eax ; KNL-NEXT: retq @@ -316,11 +319,14 @@ define i16 @test16(i1 *%addr, i16 %a) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrw $10, %k1, %k2 -; SKX-NEXT: kxorw %k0, %k2, %k0 +; SKX-NEXT: kshiftrw $11, %k1, %k2 +; SKX-NEXT: kshiftlw $11, %k2, %k2 +; SKX-NEXT: kshiftlw $6, %k1, %k1 +; SKX-NEXT: kshiftrw $6, %k1, %k1 ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $5, %k0, %k0 -; SKX-NEXT: kxorw %k0, %k1, %k0 +; SKX-NEXT: korw %k0, %k2, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $ax killed $ax killed $eax ; SKX-NEXT: retq @@ -336,12 +342,15 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movb (%rdi), %al ; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; 
KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $al killed $al killed $eax ; KNL-NEXT: retq @@ -350,11 +359,14 @@ define i8 @test17(i1 *%addr, i8 %a) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrb $4, %k1, %k2 -; SKX-NEXT: kxorb %k0, %k2, %k0 +; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $3, %k0, %k0 -; SKX-NEXT: kxorb %k0, %k1, %k0 +; SKX-NEXT: korb %k0, %k2, %k0 +; SKX-NEXT: korb %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $al killed $al killed $eax ; SKX-NEXT: retq @@ -790,12 +802,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: orl %ecx, %eax ; KNL-NEXT: vzeroupper @@ -808,12 +823,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1 ; SKX-NEXT: kunpckwd %k0, %k1, %k0 -; SKX-NEXT: kshiftrd $4, %k0, %k1 +; SKX-NEXT: kshiftrd $5, %k0, %k1 +; SKX-NEXT: kshiftld $5, %k1, %k1 +; SKX-NEXT: kshiftld $28, %k0, %k0 +; SKX-NEXT: kshiftrd $28, %k0, %k0 ; SKX-NEXT: kmovd %eax, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftld $31, %k1, %k1 -; SKX-NEXT: kshiftrd $27, %k1, %k1 -; SKX-NEXT: kxord %k1, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $27, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -832,12 +850,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -848,12 +869,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; SKX-NEXT: kshiftrb $2, %k0, %k1 -; 
SKX-NEXT: kmovd %eax, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $3, %k0, %k1 +; SKX-NEXT: kshiftlb $3, %k1, %k1 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $al killed $al killed $eax ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index a526518c3fe69d..e7f132bcdc6763 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2250,8 +2250,7 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_add_epi64_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xd4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_add_epi64_rmb: @@ -2269,10 +2268,9 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> ; X86-LABEL: test_mask_add_epi64_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xd4,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -2293,10 +2291,9 @@ define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) ; X86-LABEL: test_mask_add_epi64_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xd4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_add_epi64_rmbkz: @@ -2418,8 +2415,7 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_sub_epi64_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0xc1] +; X86-NEXT: vpsubq 
(%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xfb,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_sub_epi64_rmb: @@ -2437,10 +2433,9 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> ; X86-LABEL: test_mask_sub_epi64_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpsubq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpsubq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xfb,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -2461,10 +2456,9 @@ define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) ; X86-LABEL: test_mask_sub_epi64_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpsubq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xfb,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_sub_epi64_rmbkz: diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index ea9742a5762169..8c86d957d4ca2a 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -484,8 +484,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; KNL-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2 ; KNL-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpcmpleq %zmm1, %zmm0, %k1 -; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 {%k1} +; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 +; KNL-NEXT: vpcmpleq %zmm1, %zmm0, %k1 {%k1} ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: vzeroupper @@ -493,8 +493,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; ; SKX-LABEL: test4: ; SKX: ## %bb.0: -; SKX-NEXT: vpcmpleq %ymm1, %ymm0, %k1 -; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} +; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 +; SKX-NEXT: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -505,8 +505,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; AVX512BW-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpcmpleq %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 {%k1} +; AVX512BW-NEXT: vpcmpgtq 
%zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpcmpleq %zmm1, %zmm0, %k1 {%k1} ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -518,8 +518,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; AVX512DQ-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512DQ-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512DQ-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vpcmpleq %zmm1, %zmm0, %k1 -; AVX512DQ-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1} +; AVX512DQ-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 +; AVX512DQ-NEXT: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -527,8 +527,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; ; X86-LABEL: test4: ; X86: ## %bb.0: -; X86-NEXT: vpcmpleq %ymm1, %ymm0, %k1 -; X86-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} +; X86-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 +; X86-NEXT: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ; X86-NEXT: vpmovm2d %k0, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -546,8 +546,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; KNL-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpcmpleq %zmm3, %zmm2, %k1 -; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 {%k1} +; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 +; KNL-NEXT: vpcmpleq %zmm3, %zmm2, %k1 {%k1} ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: vzeroupper @@ -555,8 +555,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; ; SKX-LABEL: test5: ; SKX: ## %bb.0: -; SKX-NEXT: vpcmpleq %xmm3, %xmm2, %k1 -; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 {%k1} +; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 +; SKX-NEXT: vpcmpleq %xmm3, %xmm2, %k0 {%k1} ; SKX-NEXT: vpmovm2q %k0, %xmm0 ; SKX-NEXT: retq ; @@ -566,8 +566,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; AVX512BW-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpcmpleq %zmm3, %zmm2, %k1 -; AVX512BW-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 {%k1} +; AVX512BW-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 +; AVX512BW-NEXT: vpcmpleq %zmm3, %zmm2, %k1 {%k1} ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -579,8 +579,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; AVX512DQ-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512DQ-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vpcmpleq %zmm3, %zmm2, %k1 -; AVX512DQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 {%k1} +; AVX512DQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 +; AVX512DQ-NEXT: vpcmpleq %zmm3, %zmm2, %k0 {%k1} ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -588,8 +588,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; ; X86-LABEL: test5: ; X86: ## %bb.0: -; X86-NEXT: vpcmpleq %xmm3, %xmm2, %k1 -; X86-NEXT: 
vpcmpgtq %xmm0, %xmm1, %k0 {%k1} +; X86-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 +; X86-NEXT: vpcmpleq %xmm3, %xmm2, %k0 {%k1} ; X86-NEXT: vpmovm2q %k0, %xmm0 ; X86-NEXT: retl %x_gt_y = icmp slt <2 x i64> %x, %y @@ -1069,12 +1069,16 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftrw $5, %k0, %k4 -; KNL-NEXT: kxnorw %k0, %k0, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: movb $1, %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k5, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -1091,24 +1095,32 @@ define <64 x i8> @test16(i64 %x) { ; SKX-LABEL: test16: ; SKX: ## %bb.0: ; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrq $5, %k0, %k2 -; SKX-NEXT: kxorq %k1, %k2, %k1 -; SKX-NEXT: kshiftlq $63, %k1, %k1 -; SKX-NEXT: kshiftrq $58, %k1, %k1 -; SKX-NEXT: kxorq %k1, %k0, %k0 +; SKX-NEXT: kshiftrq $6, %k0, %k1 +; SKX-NEXT: kshiftlq $6, %k1, %k1 +; SKX-NEXT: kshiftlq $59, %k0, %k0 +; SKX-NEXT: kshiftrq $59, %k0, %k0 +; SKX-NEXT: movb $1, %al +; SKX-NEXT: kmovd %eax, %k2 +; SKX-NEXT: kshiftlq $63, %k2, %k2 +; SKX-NEXT: kshiftrq $58, %k2, %k2 +; SKX-NEXT: korq %k2, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq ; ; AVX512BW-LABEL: test16: ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: kmovq %rdi, %k0 -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kxorq %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 -; AVX512BW-NEXT: kxorq %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kshiftlq $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlq $59, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $59, %k0, %k0 +; AVX512BW-NEXT: movb $1, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kshiftlq $63, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k2, %k2 +; AVX512BW-NEXT: korq %k2, %k1, %k1 +; AVX512BW-NEXT: korq %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1116,27 +1128,31 @@ define <64 x i8> @test16(i64 %x) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: movl %edi, %ecx -; AVX512DQ-NEXT: kmovw %edi, %k0 +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: shrq $32, %rdi ; AVX512DQ-NEXT: shrq $48, %rax ; AVX512DQ-NEXT: shrl $16, %ecx -; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4 -; AVX512DQ-NEXT: kxnorw %k0, %k0, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $6, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 +; AVX512DQ-NEXT: movb $1, %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kshiftlw $15, %k5, %k5 +; AVX512DQ-NEXT: kshiftrw $10, %k5, %k5 +; AVX512DQ-NEXT: korw %k5, %k4, %k4 +; 
AVX512DQ-NEXT: korw %k4, %k1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1145,12 +1161,16 @@ define <64 x i8> @test16(i64 %x) { ; X86-LABEL: test16: ; X86: ## %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 -; X86-NEXT: kshiftrq $5, %k0, %k1 -; X86-NEXT: kxnorw %k0, %k0, %k2 -; X86-NEXT: kxorq %k2, %k1, %k1 -; X86-NEXT: kshiftlq $63, %k1, %k1 -; X86-NEXT: kshiftrq $58, %k1, %k1 -; X86-NEXT: kxorq %k1, %k0, %k0 +; X86-NEXT: kshiftrq $6, %k0, %k1 +; X86-NEXT: kshiftlq $6, %k1, %k1 +; X86-NEXT: kshiftlq $59, %k0, %k0 +; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: movb $1, %al +; X86-NEXT: kmovd %eax, %k2 +; X86-NEXT: kshiftlq $63, %k2, %k2 +; X86-NEXT: kshiftrq $58, %k2, %k2 +; X86-NEXT: korq %k2, %k1, %k1 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1174,12 +1194,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: cmpl %edx, %esi ; KNL-NEXT: setg %al -; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k5, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -1198,12 +1221,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi ; SKX-NEXT: setg %al -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftrq $5, %k0, %k2 -; SKX-NEXT: kxorq %k1, %k2, %k1 -; SKX-NEXT: kshiftlq $63, %k1, %k1 -; SKX-NEXT: kshiftrq $58, %k1, %k1 -; SKX-NEXT: kxorq %k1, %k0, %k0 +; SKX-NEXT: kshiftrq $6, %k0, %k1 +; SKX-NEXT: kshiftlq $6, %k1, %k1 +; SKX-NEXT: kshiftlq $59, %k0, %k0 +; SKX-NEXT: kshiftrq $59, %k0, %k0 +; SKX-NEXT: kmovd %eax, %k2 +; SKX-NEXT: kshiftlq $63, %k2, %k2 +; SKX-NEXT: kshiftrq $58, %k2, %k2 +; SKX-NEXT: korq %k2, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq ; @@ -1212,12 +1238,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512BW-NEXT: kmovq %rdi, %k0 ; AVX512BW-NEXT: cmpl %edx, %esi ; AVX512BW-NEXT: setg %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kxorq %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 -; AVX512BW-NEXT: kxorq %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kshiftlq $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlq $59, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $59, %k0, %k0 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kshiftlq $63, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k2, %k2 +; AVX512BW-NEXT: korq %k2, %k1, %k1 +; 
AVX512BW-NEXT: korq %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1225,29 +1254,32 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: movl %edi, %ecx -; AVX512DQ-NEXT: kmovw %edi, %k0 +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: shrq $32, %rdi ; AVX512DQ-NEXT: shrq $48, %rax ; AVX512DQ-NEXT: shrl $16, %ecx -; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 ; AVX512DQ-NEXT: cmpl %edx, %esi ; AVX512DQ-NEXT: setg %al -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $6, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 ; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k5, %k5 +; AVX512DQ-NEXT: kshiftrw $10, %k5, %k5 +; AVX512DQ-NEXT: korw %k5, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1259,12 +1291,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setg %al -; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: kshiftrq $5, %k0, %k2 -; X86-NEXT: kxorq %k1, %k2, %k1 -; X86-NEXT: kshiftlq $63, %k1, %k1 -; X86-NEXT: kshiftrq $58, %k1, %k1 -; X86-NEXT: kxorq %k1, %k0, %k0 +; X86-NEXT: kshiftrq $6, %k0, %k1 +; X86-NEXT: kshiftlq $6, %k1, %k1 +; X86-NEXT: kshiftlq $59, %k0, %k0 +; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: kmovd %eax, %k2 +; X86-NEXT: kshiftlq $63, %k2, %k2 +; X86-NEXT: kshiftrq $58, %k2, %k2 +; X86-NEXT: korq %k2, %k1, %k1 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1281,10 +1316,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kshiftrw $8, %k1, %k2 ; KNL-NEXT: kshiftrw $9, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k0, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k3 ; KNL-NEXT: kshiftlw $6, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kshiftlw $9, %k0, %k0 ; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: kshiftlw $7, %k2, %k1 @@ -1301,10 +1338,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; SKX-NEXT: kmovd %esi, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: kshiftrw $9, %k1, %k1 -; SKX-NEXT: kshiftrb $6, %k0, %k3 -; SKX-NEXT: kxorb %k1, %k3, %k1 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k3 ; SKX-NEXT: kshiftlb $6, %k1, %k1 -; SKX-NEXT: kxorb %k1, %k0, %k0 +; SKX-NEXT: korb %k1, %k3, %k1 +; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 ; SKX-NEXT: 
kshiftrb $1, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k2, %k1 @@ -1318,10 +1357,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k3 -; AVX512BW-NEXT: kxorw %k1, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $6, %k1, %k1 -; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $7, %k2, %k1 @@ -1337,10 +1378,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512DQ-NEXT: kmovw %esi, %k1 ; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 ; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1 -; AVX512DQ-NEXT: kshiftrb $6, %k0, %k3 -; AVX512DQ-NEXT: kxorb %k1, %k3, %k1 +; AVX512DQ-NEXT: kshiftlb $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k3 ; AVX512DQ-NEXT: kshiftlb $6, %k1, %k1 -; AVX512DQ-NEXT: kxorb %k1, %k0, %k0 +; AVX512DQ-NEXT: korb %k1, %k3, %k1 +; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 @@ -1357,10 +1400,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kshiftrw $8, %k1, %k2 ; X86-NEXT: kshiftrw $9, %k1, %k1 -; X86-NEXT: kshiftrb $6, %k0, %k3 -; X86-NEXT: kxorb %k1, %k3, %k1 +; X86-NEXT: kshiftlb $7, %k0, %k3 +; X86-NEXT: kshiftlb $2, %k0, %k0 +; X86-NEXT: kshiftrb $2, %k0, %k0 ; X86-NEXT: kshiftlb $6, %k1, %k1 -; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: korb %k1, %k3, %k1 +; X86-NEXT: korb %k1, %k0, %k0 ; X86-NEXT: kshiftlb $1, %k0, %k0 ; X86-NEXT: kshiftrb $1, %k0, %k0 ; X86-NEXT: kshiftlb $7, %k2, %k1 @@ -2748,403 +2793,488 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; KNL-LABEL: store_64i1: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw %ecx, %k0 -; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k3 +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 ; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kshiftlw $4, %k0, %k4 +; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; 
KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k5 +; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k6 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k7 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; 
KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k2, %k4, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: 
korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, 
%k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k1, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k3, %k5, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k4, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; 
KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: 
kshiftlw $10, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k1, %k5, %k1 -; KNL-NEXT: kshiftrw $2, %k1, %k5 -; KNL-NEXT: kxorw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $15, %k7, %k7 +; KNL-NEXT: korw %k2, %k7, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $2, %k7, %k7 +; KNL-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: korw %k7, %k6, %k7 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $8, %k7, %k7 +; KNL-NEXT: korw %k7, %k3, %k7 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $9, %k7, %k7 +; KNL-NEXT: korw %k7, %k4, %k7 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, 
%k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k5, %k6 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k2 +; KNL-NEXT: korw %k6, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k6 +; KNL-NEXT: kshiftlw $11, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k6, %k1, %k5 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: korw %k5, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k5, %k5 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k5, %k1, %k4 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k2 +; KNL-NEXT: korw %k4, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $13, %k4, %k4 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k4, %k1, %k3 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k2 +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $14, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k3, %k1, %k1 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kmovw %k1, 6(%rdi) -; KNL-NEXT: kmovw %k3, 4(%rdi) -; KNL-NEXT: kmovw %k2, 2(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: kmovw %k0, 4(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: kmovw %k0, 2(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; KNL-NEXT: kmovw %k0, (%rdi) ; KNL-NEXT: retq ; @@ -3166,403 +3296,488 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; AVX512DQ-LABEL: store_64i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: kmovw %ecx, %k0 -; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQ-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQ-NEXT: kmovw %esi, %k0 ; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: kmovw %edx, %k1 +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k3 +; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kshiftlw $2, %k1, %k1 +; 
AVX512DQ-NEXT: korw %k1, %k3, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 -; AVX512DQ-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQ-NEXT: kshiftrw $3, %k0, %k2 -; AVX512DQ-NEXT: kmovw %r8d, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $4, %k0, %k2 -; AVX512DQ-NEXT: kmovw %r9d, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k4 +; AVX512DQ-NEXT: kmovw %r8d, %k1 +; AVX512DQ-NEXT: kshiftlw $3, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k4, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k5 +; AVX512DQ-NEXT: kmovw %r9d, %k1 +; AVX512DQ-NEXT: kshiftlw $4, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k5, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k6 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $5, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k6, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k7 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k7, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $7, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $8, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; 
AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $9, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $10, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $13, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k0, %k1, %k0 +; AVX512DQ-NEXT: korw %k0, %k2, %k0 ; AVX512DQ-NEXT: 
kshiftlw $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k1, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQ-NEXT: kxorw %k2, %k4, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQ-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $12, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $11, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $10, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k6, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $9, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k7, 
%k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $8, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $7, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $6, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $4, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $3, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: 
kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $14, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k1, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k5 -; AVX512DQ-NEXT: kxorw %k3, %k5, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: 
kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k6, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, 
%k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $1, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: korw %k4, %k3, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k1, %k5, %k1 -; AVX512DQ-NEXT: kshiftrw $2, %k1, %k5 -; AVX512DQ-NEXT: kxorw %k4, %k5, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $15, %k7, %k7 +; AVX512DQ-NEXT: kshiftrw $15, %k7, %k7 +; AVX512DQ-NEXT: korw %k2, %k7, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: 
kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $2, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $3, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $4, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $5, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k6, %k7 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $6, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $7, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k1, %k7 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQ-NEXT: kmovw 
%eax, %k7 +; AVX512DQ-NEXT: kshiftlw $8, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k3, %k7 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $9, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k4, %k7 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $10, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k5, %k6 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQ-NEXT: korw %k6, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k6 +; AVX512DQ-NEXT: kshiftlw $11, %k6, %k6 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k6, %k1, %k5 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k5, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k5, %k5 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k5, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQ-NEXT: korw %k4, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kshiftlw $13, %k4, %k4 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k4, %k1, %k3 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kshiftlw $14, %k3, %k3 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k3, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQ-NEXT: korw %k1, %k2, %k1 ; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 ; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: korw %k4, %k1, %k1 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k1, %k1 ; AVX512DQ-NEXT: kmovw %k1, 6(%rdi) -; AVX512DQ-NEXT: kmovw %k3, 4(%rdi) -; AVX512DQ-NEXT: kmovw %k2, 2(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: kmovw %k0, 4(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), 
%k0 ## 2-byte Reload +; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll index 57cabca007661b..065a55fe7c51be 100644 --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -750,8 +750,7 @@ define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) { ; ; SKX-LABEL: usat_trunc_db_256: ; SKX: ## %bb.0: -; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; SKX-NEXT: vpmovdb %ymm0, %xmm0 +; SKX-NEXT: vpmovusdb %ymm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %tmp1 = icmp ult <8 x i32> %x, diff --git a/llvm/test/CodeGen/X86/avx512-vselect.ll b/llvm/test/CodeGen/X86/avx512-vselect.ll index d61e4e13df9ca9..07e5aeac015bc8 100644 --- a/llvm/test/CodeGen/X86/avx512-vselect.ll +++ b/llvm/test/CodeGen/X86/avx512-vselect.ll @@ -47,3 +47,159 @@ entry: %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b ret <16 x double> %ret } + +define <16 x i64> @test3(<16 x i8> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-SKX-LABEL: test3: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vptestnmb %xmm0, %xmm0, %k1 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test3: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0 +; CHECK-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <16 x i8> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <16 x i64> @test4(<16 x i16> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-SKX-LABEL: test4: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vptestnmw %ymm0, %ymm0, %k1 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test4: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqw %ymm5, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <16 x i16> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <16 x i64> @test5(<16 x i32> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: +; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; CHECK-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-NEXT: retq + %c = icmp eq <16 x i32> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <32 x i32> @test6(<32 x i8> %x, <32 x i32> %a, <32 x i32> %b) { +; CHECK-SKX-LABEL: test6: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vptestnmb %ymm0, %ymm0, %k1 +; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrd 
$16, %k1, %k1 +; CHECK-SKX-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test6: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm0, %xmm5 +; CHECK-KNL-NEXT: vpmovsxbd %xmm5, %zmm5 +; CHECK-KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 +; CHECK-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 +; CHECK-KNL-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-KNL-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <32 x i8> %x, zeroinitializer + %ret = select <32 x i1> %c, <32 x i32> %a, <32 x i32> %b + ret <32 x i32> %ret +} + +define <32 x i32> @test7(<32 x i16> %x, <32 x i32> %a, <32 x i32> %b) { +; CHECK-SKX-LABEL: test7: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vptestnmw %zmm0, %zmm0, %k1 +; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrd $16, %k1, %k1 +; CHECK-SKX-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test7: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-KNL-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; CHECK-KNL-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm5 +; CHECK-KNL-NEXT: vpmovsxwd %ymm5, %zmm5 +; CHECK-KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 +; CHECK-KNL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 +; CHECK-KNL-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-KNL-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <32 x i16> %x, zeroinitializer + %ret = select <32 x i1> %c, <32 x i32> %a, <32 x i32> %b + ret <32 x i32> %ret +} + +define <64 x i16> @test8(<64 x i8> %x, <64 x i16> %a, <64 x i16> %b) { +; CHECK-SKX-LABEL: test8: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vptestnmb %zmm0, %zmm0, %k1 +; CHECK-SKX-NEXT: vpblendmw %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-SKX-NEXT: kshiftrq $32, %k1, %k1 +; CHECK-SKX-NEXT: vpblendmw %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test8: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: pushq %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-KNL-NEXT: .cfi_offset %rbp, -16 +; CHECK-KNL-NEXT: movq %rsp, %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa_register %rbp +; CHECK-KNL-NEXT: andq $-32, %rsp +; CHECK-KNL-NEXT: subq $32, %rsp +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; CHECK-KNL-NEXT: vmovdqa 16(%rbp), %ymm9 +; CHECK-KNL-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; CHECK-KNL-NEXT: vpcmpeqb %ymm10, %ymm0, %ymm11 +; CHECK-KNL-NEXT: vpmovsxbw %xmm11, %ymm0 +; CHECK-KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm5, %ymm0 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm11, %xmm1 +; CHECK-KNL-NEXT: vpmovsxbw %xmm1, %ymm1 +; CHECK-KNL-NEXT: vpblendvb %ymm1, %ymm2, %ymm6, %ymm1 +; CHECK-KNL-NEXT: vpcmpeqb %ymm10, %ymm8, %ymm5 +; CHECK-KNL-NEXT: vpmovsxbw %xmm5, %ymm2 +; CHECK-KNL-NEXT: vpblendvb %ymm2, %ymm3, %ymm7, %ymm2 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm5, %xmm3 +; CHECK-KNL-NEXT: vpmovsxbw %xmm3, %ymm3 +; CHECK-KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm9, %ymm3 +; CHECK-KNL-NEXT: movq %rbp, %rsp +; CHECK-KNL-NEXT: popq %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-KNL-NEXT: retq + %c = icmp eq <64 x i8> %x, zeroinitializer + %ret = select <64 x i1> %c, <64 x i16> %a, <64 x i16> %b + ret <64 x i16> %ret +} diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 
63402d8019937c..b645098582ff3d 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -2011,8 +2011,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpmullq %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x58,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_512: @@ -2030,9 +2029,8 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x ; X86-LABEL: test_mask_mullo_epi64_rmbk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x59,0x40,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2053,9 +2051,8 @@ define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_512: @@ -2172,8 +2169,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] -; X86-NEXT: vpmullq %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x38,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_256: @@ -2191,9 +2187,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x ; X86-LABEL: test_mask_mullo_epi64_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x39,0x40,0x08] ; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2214,9 +2209,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> 
%a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_256: @@ -2334,8 +2328,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] -; X86-NEXT: vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_128: @@ -2353,9 +2346,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x ; X86-LABEL: test_mask_mullo_epi64_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0x40,0x08] ; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2376,9 +2368,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_128: diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll index 5b90bdb8311bf4..bf85814f31973d 100644 --- a/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll @@ -199,8 +199,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xb5,0xc2] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: 
test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: @@ -236,8 +235,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6 ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 # encoding: [0x62,0xf2,0xed,0x48,0xb5,0xc1] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: @@ -276,10 +274,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: @@ -319,10 +316,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: @@ -362,10 +358,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: @@ -405,10 +400,9 @@ define <8 x 
i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(< ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xed,0xc9,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: diff --git a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll index 077269fde95ccc..6884666a296ce1 100644 --- a/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512ifma-intrinsics.ll @@ -219,8 +219,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xb5,0xc2] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: @@ -256,8 +255,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6 ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 # encoding: [0x62,0xf2,0xed,0x48,0xb5,0xc1] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: @@ -298,10 +296,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: @@ -345,10 +342,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 ; X86-LABEL: 
test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: @@ -392,10 +388,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: @@ -439,10 +434,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(< ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xed,0xc9,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 8684d1f568fdb8..233b9162c92625 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -3922,10 +3922,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 ; X86-LABEL: test_mask_andnot_epi64_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %xmm2, %xmm0, %xmm1 
{%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x08] ; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3946,10 +3945,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_andnot_epi64_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi64_rmbkz_128: @@ -4089,10 +4087,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 ; X86-LABEL: test_mask_andnot_epi64_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x08] ; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4113,10 +4110,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_andnot_epi64_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi64_rmbkz_256: diff --git a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll index c4c70fcb2b5c26..fe3662d49aa53f 100644 --- a/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll @@ -229,8 +229,7 @@ define void 
@test_mm256_2intersect_epi64_b(i64* nocapture readonly %a, i64* noca ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] ; X86-NEXT: vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06] -; X86-NEXT: vbroadcastsd (%edx), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x0a] -; X86-NEXT: vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1] +; X86-NEXT: vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] ; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2] @@ -535,9 +534,7 @@ define void @test_mm_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocaptu ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] ; X86-NEXT: vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06] ; X86-NEXT: # xmm0 = mem[0,0] -; X86-NEXT: vmovddup (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x0a] -; X86-NEXT: # xmm1 = mem[0,0] -; X86-NEXT: vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1] +; X86-NEXT: vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02] ; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] ; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2] diff --git a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll index 3e37c7c5b6ac08..7e7a46db75edcd 100644 --- a/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll @@ -199,8 +199,7 @@ define void @test_mm512_2intersect_epi64_b(i64* nocapture readonly %a, i64* noca ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] ; X86-NEXT: vbroadcastsd (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x02] -; X86-NEXT: vbroadcastsd (%ecx), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x09] -; X86-NEXT: vp2intersectq %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0xc1] +; X86-NEXT: vp2intersectq (%ecx){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] ; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10] diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll index b982cde2a957b1..e50dca9646567f 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -55,17 +55,17 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; ; AVX1-LABEL: v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, 
%xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskpd %ymm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper @@ -73,9 +73,9 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; ; AVX2-LABEL: v4i64: ; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovmskpd %ymm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -121,9 +121,9 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> ; ; AVX12-LABEL: v4f64: ; AVX12: # %bb.0: +; AVX12-NEXT: vcmpltpd %ymm2, %ymm3, %ymm2 ; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX12-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1 -; AVX12-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX12-NEXT: vandpd %ymm2, %ymm0, %ymm0 ; AVX12-NEXT: vmovmskpd %ymm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: vzeroupper @@ -241,32 +241,28 @@ define i8 @v8i32_and(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; ; AVX1-LABEL: v8i32_and: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32_and: ; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -314,32 +310,28 @@ define i8 @v8i32_or(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; ; AVX1-LABEL: v8i32_or: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32_or: ; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -395,42 +387,36 @@ define i8 @v8i32_or_and(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d, ; ; AVX1-LABEL: v8i32_or_and: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 +; AVX1-NEXT: vpcmpgtd %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i32_or_and: ; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm4, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw 
%xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -482,13 +468,10 @@ define i8 @v8f32_and(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> ; ; AVX12-LABEL: v8f32_and: ; AVX12: # %bb.0: +; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm2 ; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm1 -; AVX12-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX12-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX12-NEXT: vmovmskps %ymm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: vzeroupper ; AVX12-NEXT: retq @@ -536,13 +519,10 @@ define i8 @v8f32_xor(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> ; ; AVX12-LABEL: v8f32_xor: ; AVX12: # %bb.0: +; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm2 ; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm1 -; AVX12-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX12-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX12-NEXT: vmovmskps %ymm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: vzeroupper ; AVX12-NEXT: retq @@ -604,17 +584,12 @@ define i8 @v8f32_xor_and(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x fl ; ; AVX12-LABEL: v8f32_xor_and: ; AVX12: # %bb.0: +; AVX12-NEXT: vcmpeq_uqps %ymm3, %ymm2, %ymm2 ; AVX12-NEXT: vcmpnleps %ymm1, %ymm0, %ymm0 -; AVX12-NEXT: vcmpeq_uqps %ymm3, %ymm2, %ymm1 -; AVX12-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX12-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX12-NEXT: vcmpltps %ymm4, %ymm5, %ymm1 -; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX12-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX12-NEXT: vmovmskps %ymm0, %eax ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: vzeroupper ; AVX12-NEXT: retq diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll index 9384b24ab28b9d..29499848eb480e 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -50,56 +50,45 @@ define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) { ; ; AVX1-LABEL: v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9 ; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 +; 
AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm8, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: v8i64: ; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpgtq %ymm7, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpackssdw %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq %ymm7, %ymm5, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovmskps %ymm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -171,29 +160,42 @@ define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; -; AVX12-LABEL: v8f64: -; AVX12: # %bb.0: -; AVX12-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 -; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX12-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX12-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 -; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX12-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX12-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vcmpltpd %ymm5, %ymm7, %ymm1 -; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX12-NEXT: vcmpltpd %ymm4, %ymm6, %ymm2 -; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX12-NEXT: vpackssdw %xmm3, %xmm2, 
%xmm2 -; AVX12-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: # kill: def $al killed $al killed $eax -; AVX12-NEXT: vzeroupper -; AVX12-NEXT: retq +; AVX1-LABEL: v8f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpltpd %ymm5, %ymm7, %ymm5 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-NEXT: vpackssdw %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vcmpltpd %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vmovmskps %ymm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vcmpltpd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vcmpltpd %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpackssdw %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovmskps %ymm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: v8f64: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll b/llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll index ae07f84343d4fe..9c408c70cfbf49 100644 --- a/llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll +++ b/llvm/test/CodeGen/X86/constrained-fp80-trunc-ext.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O3 -mtriple=x86_64-gnu-linux < %s | FileCheck %s -define x86_fp80 @constrained_fpext_f32_as_fp80(float %mem) { +define x86_fp80 @constrained_fpext_f32_as_fp80(float %mem) #0 { ; CHECK-LABEL: constrained_fpext_f32_as_fp80: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp) @@ -10,11 +10,11 @@ define x86_fp80 @constrained_fpext_f32_as_fp80(float %mem) { entry: %ext = call x86_fp80 @llvm.experimental.constrained.fpext.f80.f32( float %mem, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret x86_fp80 %ext } -define float @constrained_fptrunc_f80_to_f32(x86_fp80 %reg) { +define float @constrained_fptrunc_f80_to_f32(x86_fp80 %reg) #0 { ; CHECK-LABEL: constrained_fptrunc_f80_to_f32: ; CHECK: # %bb.0: ; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) @@ -24,11 +24,11 @@ define float @constrained_fptrunc_f80_to_f32(x86_fp80 %reg) { %trunc = call float @llvm.experimental.constrained.fptrunc.f32.f80( x86_fp80 %reg, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %trunc } -define x86_fp80 @constrained_fpext_f64_to_f80(double %mem) { +define x86_fp80 @constrained_fpext_f64_to_f80(double %mem) #0 { ; CHECK-LABEL: constrained_fpext_f64_to_f80: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) @@ -37,11 +37,11 @@ define x86_fp80 @constrained_fpext_f64_to_f80(double %mem) { entry: %ext = call x86_fp80 
@llvm.experimental.constrained.fpext.f80.f64( double %mem, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret x86_fp80 %ext } -define double @constrained_fptrunc_f80_to_f64(x86_fp80 %reg) { +define double @constrained_fptrunc_f80_to_f64(x86_fp80 %reg) #0 { ; CHECK-LABEL: constrained_fptrunc_f80_to_f64: ; CHECK: # %bb.0: ; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) @@ -51,10 +51,12 @@ define double @constrained_fptrunc_f80_to_f64(x86_fp80 %reg) { %trunc = call double @llvm.experimental.constrained.fptrunc.f64.f80( x86_fp80 %reg, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %trunc } +attributes #0 = { strictfp } + declare x86_fp80 @llvm.experimental.constrained.fpext.f80.f32(float, metadata) declare x86_fp80 @llvm.experimental.constrained.fpext.f80.f64(double, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f80(x86_fp80, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll index cbb1c386a62c1c..2f5224eaf6c2fe 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -11,13 +11,13 @@ ; ; CHECK-LABEL: f1 ; COMMON: divsd -define double @f1() { +define double @f1() #0 { entry: %div = call double @llvm.experimental.constrained.fdiv.f64( double 1.000000e+00, double 1.000000e+01, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %div } @@ -31,13 +31,13 @@ entry: ; ; CHECK-LABEL: f2 ; COMMON: subsd -define double @f2(double %a) { +define double @f2(double %a) #0 { entry: %sub = call double @llvm.experimental.constrained.fsub.f64( double %a, double 0.000000e+00, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %sub } @@ -54,21 +54,21 @@ entry: ; COMMON: subsd ; COMMON: mulsd ; COMMON: subsd -define double @f3(double %a, double %b) { +define double @f3(double %a, double %b) #0 { entry: %sub = call double @llvm.experimental.constrained.fsub.f64( double -0.000000e+00, double %a, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul = call double @llvm.experimental.constrained.fmul.f64( double %sub, double %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %ret = call double @llvm.experimental.constrained.fsub.f64( double -0.000000e+00, double %mul, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %ret } @@ -87,7 +87,7 @@ entry: ; COMMON: testl ; COMMON: jle ; COMMON: addsd -define double @f4(i32 %n, double %a) { +define double @f4(i32 %n, double %a) #0 { entry: %cmp = icmp sgt i32 %n, 0 br i1 %cmp, label %if.then, label %if.end @@ -96,7 +96,7 @@ if.then: %add = call double @llvm.experimental.constrained.fadd.f64( double 1.000000e+00, double %a, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 br label %if.end if.end: @@ -107,112 +107,112 @@ if.end: ; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f5 ; COMMON: sqrtsd -define double @f5() { +define double @f5() #0 { entry: %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown. 
; CHECK-LABEL: f6 ; COMMON: pow -define double @f6() { +define double @f6() #0 { entry: %result = call double @llvm.experimental.constrained.pow.f64(double 42.1, double 3.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f7 ; COMMON: powi -define double @f7() { +define double @f7() #0 { entry: %result = call double @llvm.experimental.constrained.powi.f64(double 42.1, i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that sin(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f8 ; COMMON: sin -define double @f8() { +define double @f8() #0 { entry: %result = call double @llvm.experimental.constrained.sin.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that cos(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f9 ; COMMON: cos -define double @f9() { +define double @f9() #0 { entry: %result = call double @llvm.experimental.constrained.cos.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f10 ; COMMON: exp -define double @f10() { +define double @f10() #0 { entry: %result = call double @llvm.experimental.constrained.exp.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f11 ; COMMON: exp2 -define double @f11() { +define double @f11() #0 { entry: %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that log(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f12 ; COMMON: log -define double @f12() { +define double @f12() #0 { entry: %result = call double @llvm.experimental.constrained.log.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that log10(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f13 ; COMMON: log10 -define double @f13() { +define double @f13() #0 { entry: %result = call double @llvm.experimental.constrained.log10.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that log2(42.0) isn't simplified when the rounding mode is unknown. 
; CHECK-LABEL: f14 ; COMMON: log2 -define double @f14() { +define double @f14() #0 { entry: %result = call double @llvm.experimental.constrained.log2.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } @@ -220,11 +220,11 @@ entry: ; CHECK-LABEL: f15 ; NO-FMA: rint ; HAS-FMA: vroundsd -define double @f15() { +define double @f15() #0 { entry: %result = call double @llvm.experimental.constrained.rint.f64(double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } @@ -233,12 +233,12 @@ entry: ; CHECK-LABEL: f16 ; NO-FMA: nearbyint ; HAS-FMA: vroundsd -define double @f16() { +define double @f16() #0 { entry: %result = call double @llvm.experimental.constrained.nearbyint.f64( double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } @@ -247,14 +247,14 @@ entry: ; CHECK-LABEL: f17 ; FMACALL32: jmp fmaf # TAILCALL ; FMA32: vfmadd213ss -define float @f17() { +define float @f17() #0 { entry: %result = call float @llvm.experimental.constrained.fma.f32( float 3.5, float 3.5, float 3.5, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %result } @@ -263,26 +263,26 @@ entry: ; CHECK-LABEL: f18 ; FMACALL64: jmp fma # TAILCALL ; FMA64: vfmadd213sd -define double @f18() { +define double @f18() #0 { entry: %result = call double @llvm.experimental.constrained.fma.f64( double 42.1, double 42.1, double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; CHECK-LABEL: f19 ; COMMON: fmod -define double @f19() { +define double @f19() #0 { entry: %rem = call double @llvm.experimental.constrained.frem.f64( double 1.000000e+00, double 1.000000e+01, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %rem } @@ -312,10 +312,10 @@ entry: ; HAS-FMA: setae ; HAS-FMA: shll ; HAS-FMA: xorl -define i32 @f20u(double %x) { +define i32 @f20u(double %x) #0 { entry: %result = call i32 @llvm.experimental.constrained.fptoui.i32.f64(double %x, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %result } @@ -324,24 +324,102 @@ entry: ; Verify that no gross errors happen. 
; CHECK-LABEL: @f21 ; COMMON: cvtsd2ss -define float @f21() { +define float @f21() #0 { entry: %result = call float @llvm.experimental.constrained.fptrunc.f32.f64( double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %result } ; CHECK-LABEL: @f22 ; COMMON: cvtss2sd -define double @f22(float %x) { +define double @f22(float %x) #0 { entry: %result = call double @llvm.experimental.constrained.fpext.f64.f32(float %x, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } +; CHECK-LABEL: f23 +; COMMON: jmp lrint +define i32 @f23(double %x) #0 { +entry: + %result = call i32 @llvm.experimental.constrained.lrint.i32.f64(double %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +; CHECK-LABEL: f24 +; COMMON: jmp lrintf +define i32 @f24(float %x) #0 { +entry: + %result = call i32 @llvm.experimental.constrained.lrint.i32.f32(float %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +; CHECK-LABEL: f25 +; COMMON: jmp llrint +define i64 @f25(double %x) #0 { +entry: + %result = call i64 @llvm.experimental.constrained.llrint.i64.f64(double %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +; CHECK-LABEL: f26 +; COMMON: jmp llrintf +define i64 @f26(float %x) { +entry: + %result = call i64 @llvm.experimental.constrained.llrint.i64.f32(float %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +; CHECK-LABEL: f27 +; COMMON: jmp lround +define i32 @f27(double %x) #0 { +entry: + %result = call i32 @llvm.experimental.constrained.lround.i32.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +; CHECK-LABEL: f28 +; COMMON: jmp lroundf +define i32 @f28(float %x) #0 { +entry: + %result = call i32 @llvm.experimental.constrained.lround.i32.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +; CHECK-LABEL: f29 +; COMMON: jmp llround +define i64 @f29(double %x) #0 { +entry: + %result = call i64 @llvm.experimental.constrained.llround.i64.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +; CHECK-LABEL: f30 +; COMMON: jmp llroundf +define i64 @f30(float %x) #0 { +entry: + %result = call i64 @llvm.experimental.constrained.llround.i64.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +attributes #0 = { strictfp } + @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) @@ -366,3 +444,11 @@ declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f64(double, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) +declare i32 @llvm.experimental.constrained.lrint.i32.f64(double, metadata, metadata) +declare i32 @llvm.experimental.constrained.lrint.i32.f32(float, metadata, metadata) +declare i64 @llvm.experimental.constrained.llrint.i64.f64(double, metadata, metadata) +declare i64 @llvm.experimental.constrained.llrint.i64.f32(float, metadata, metadata) +declare i32 @llvm.experimental.constrained.lround.i32.f64(double, metadata) +declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) 
+declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) +declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) diff --git a/llvm/test/CodeGen/X86/implicit-null-check.ll b/llvm/test/CodeGen/X86/implicit-null-check.ll index 5b0790f699aacf..6d6b31f86dbe97 100644 --- a/llvm/test/CodeGen/X86/implicit-null-check.ll +++ b/llvm/test/CodeGen/X86/implicit-null-check.ll @@ -432,14 +432,70 @@ define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) { ret i32 %z } +;; TODO: We could handle this case as we can lift the fence into the +;; previous block before the conditional without changing behavior. +define i32 @imp_null_check_load_fence1(i32* %x) { +; CHECK-LABEL: imp_null_check_load_fence1: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: je LBB16_1 +; CHECK-NEXT: ## %bb.2: ## %not_null +; CHECK-NEXT: ##MEMBARRIER +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB16_1: ## %is_null +; CHECK-NEXT: movl $42, %eax +; CHECK-NEXT: retq + +entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + +is_null: + ret i32 42 + +not_null: + fence acquire + %t = load i32, i32* %x + ret i32 %t +} + +;; TODO: We could handle this case as we can lift the fence into the +;; previous block before the conditional without changing behavior. +define i32 @imp_null_check_load_fence2(i32* %x) { +; CHECK-LABEL: imp_null_check_load_fence2: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: je LBB17_1 +; CHECK-NEXT: ## %bb.2: ## %not_null +; CHECK-NEXT: mfence +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB17_1: ## %is_null +; CHECK-NEXT: movl $42, %eax +; CHECK-NEXT: retq + +entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + +is_null: + ret i32 42 + +not_null: + fence seq_cst + %t = load i32, i32* %x + ret i32 %t +} + define void @imp_null_check_store(i32* %x) { ; CHECK-LABEL: imp_null_check_store: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: Ltmp14: -; CHECK-NEXT: movl $1, (%rdi) ## on-fault: LBB16_1 +; CHECK-NEXT: movl $1, (%rdi) ## on-fault: LBB18_1 ; CHECK-NEXT: ## %bb.2: ## %not_null ; CHECK-NEXT: retq -; CHECK-NEXT: LBB16_1: ## %is_null +; CHECK-NEXT: LBB18_1: ## %is_null ; CHECK-NEXT: retq entry: @@ -459,10 +515,10 @@ define void @imp_null_check_unordered_store(i32* %x) { ; CHECK-LABEL: imp_null_check_unordered_store: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: Ltmp15: -; CHECK-NEXT: movl $1, (%rdi) ## on-fault: LBB17_1 +; CHECK-NEXT: movl $1, (%rdi) ## on-fault: LBB19_1 ; CHECK-NEXT: ## %bb.2: ## %not_null ; CHECK-NEXT: retq -; CHECK-NEXT: LBB17_1: ## %is_null +; CHECK-NEXT: LBB19_1: ## %is_null ; CHECK-NEXT: retq entry: @@ -481,10 +537,10 @@ define i32 @imp_null_check_neg_gep_load(i32* %x) { ; CHECK-LABEL: imp_null_check_neg_gep_load: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: Ltmp16: -; CHECK-NEXT: movl -128(%rdi), %eax ## on-fault: LBB18_1 +; CHECK-NEXT: movl -128(%rdi), %eax ## on-fault: LBB20_1 ; CHECK-NEXT: ## %bb.2: ## %not_null ; CHECK-NEXT: retq -; CHECK-NEXT: LBB18_1: ## %is_null +; CHECK-NEXT: LBB20_1: ## %is_null ; CHECK-NEXT: movl $42, %eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/leaFixup32.mir b/llvm/test/CodeGen/X86/leaFixup32.mir index f614a4ad975e3f..23f3d761696687 100644 --- a/llvm/test/CodeGen/X86/leaFixup32.mir +++ b/llvm/test/CodeGen/X86/leaFixup32.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # 
RUN: llc -run-pass x86-fixup-LEAs -mtriple=i386 -verify-machineinstrs -mcpu=corei7-avx -o - %s | FileCheck %s --- | ; ModuleID = 'test/CodeGen/X86/fixup-lea.ll' @@ -40,13 +41,13 @@ define i32 @test1lea_ebp_32() { ret i32 0 } - + ;test2addi32_32: 3 operands LEA32r that can be replaced with 2 add instructions where ADD32ri32 ; is chosen define i32 @test2addi32_32() { ret i32 0 } - + ;test1mov1add_ebp_32: 2 operands LEA32r that can be replaced with 1 add 1 mov instructions ; where the base is rbp/r13/ebp register define i32 @test1mov1add_ebp_32() { @@ -64,7 +65,7 @@ define i32 @testleaadd_ebp_index2_32() { ret i32 0 } - + ;test_skip_opt_32: 3 operands LEA32r that can not be replaced with 2 instructions define i32 @test_skip_opt_32() { ret i32 0 @@ -84,10 +85,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$eax' } - { reg: '$ebp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -104,9 +105,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp - ; CHECK: $eax = ADD32rr $eax, killed $ebp - ; CHECK: $eax = ADD32ri8 $eax, -5 - + + ; CHECK-LABEL: name: test2add_32 + ; CHECK: liveins: $eax, $ebp + ; CHECK: $eax = ADD32rr $eax, $ebp, implicit-def $eflags + ; CHECK: $eax = ADD32ri8 $eax, -5, implicit-def $eflags + ; CHECK: RETQ $eax $eax = LEA32r killed $eax, 1, killed $ebp, -5, $noreg RETQ $eax @@ -119,10 +123,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$eax' } - { reg: '$ebp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -139,9 +143,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp - ; CHECK: $ebp = ADD32rr $ebp, killed $eax - ; CHECK: $ebp = ADD32ri8 $ebp, -5 - + + ; CHECK-LABEL: name: test2add_ebp_32 + ; CHECK: liveins: $eax, $ebp + ; CHECK: $ebp = ADD32rr $ebp, $eax, implicit-def $eflags + ; CHECK: $ebp = ADD32ri8 $ebp, -5, implicit-def $eflags + ; CHECK: RETQ $ebp $ebp = LEA32r killed $ebp, 1, killed $eax, -5, $noreg RETQ $ebp @@ -154,10 +161,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$eax' } - { reg: '$ebp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -174,8 +181,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp - ; CHECK: $ebp = ADD32rr $ebp, $eax - + + ; CHECK-LABEL: name: test1add_ebp_32 + ; CHECK: liveins: $eax, $ebp + ; CHECK: $ebp = ADD32rr $ebp, $eax, implicit-def $eflags + ; CHECK: RETQ $ebp $ebp = LEA32r killed $ebp, 1, killed $eax, 0, $noreg RETQ $ebp @@ -188,11 +198,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$eax' } - { reg: '$ebp' } - { reg: '$ebx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -209,9 +219,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp, $esi - ; CHECK: $ebx = LEA32r killed $eax, 1, killed $ebp, 0 - ; CHECK: $ebx = ADD32ri8 $ebx, -5 - + + ; CHECK-LABEL: name: testleaadd_32 + ; CHECK: liveins: $eax, $ebp, $esi + ; CHECK: $ebx = LEA32r killed $eax, 1, killed $ebp, 0, $noreg + ; CHECK: $ebx = ADD32ri8 $ebx, -5, implicit-def $eflags + ; CHECK: RETQ $ebx $ebx = LEA32r killed $eax, 1, killed $ebp, -5, $noreg RETQ $ebx @@ -224,11 +237,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: 
true -liveins: +liveins: - { reg: '$eax' } - { reg: '$ebp' } - { reg: '$ebx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -245,9 +258,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp + + ; CHECK-LABEL: name: testleaadd_ebp_32 + ; CHECK: liveins: $eax, $ebp ; CHECK: $ebx = LEA32r killed $eax, 1, killed $ebp, 0, $noreg - ; CHECK: $ebx = ADD32ri8 $ebx, -5 - + ; CHECK: $ebx = ADD32ri8 $ebx, -5, implicit-def $eflags + ; CHECK: RETQ $ebx $ebx = LEA32r killed $ebp, 1, killed $eax, -5, $noreg RETQ $ebx @@ -260,11 +276,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$eax' } - { reg: '$ebp' } - { reg: '$ebx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -281,8 +297,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp + + ; CHECK-LABEL: name: test1lea_ebp_32 + ; CHECK: liveins: $eax, $ebp ; CHECK: $ebx = LEA32r killed $eax, 1, killed $ebp, 0, $noreg - + ; CHECK: RETQ $ebx $ebx = LEA32r killed $ebp, 1, killed $eax, 0, $noreg RETQ $ebx @@ -295,10 +314,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$eax' } - { reg: '$ebp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -315,9 +334,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp - ; CHECK: $eax = ADD32rr $eax, killed $ebp - ; CHECK: $eax = ADD32ri $eax, 129 - + + ; CHECK-LABEL: name: test2addi32_32 + ; CHECK: liveins: $eax, $ebp + ; CHECK: $eax = ADD32rr $eax, $ebp, implicit-def $eflags + ; CHECK: $eax = ADD32ri $eax, 129, implicit-def $eflags + ; CHECK: RETQ $eax $eax = LEA32r killed $eax, 1, killed $ebp, 129, $noreg RETQ $eax @@ -330,11 +352,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$eax' } - { reg: '$ebx' } - { reg: '$ebp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -351,9 +373,13 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp, $ebx - ; CHECK: $ebx = MOV32rr $ebp - ; CHECK: $ebx = ADD32rr $ebx, $ebp - + + ; CHECK-LABEL: name: test1mov1add_ebp_32 + ; CHECK: liveins: $eax, $ebp, $ebx + ; CHECK: $ebx = MOV32rr $ebp + ; CHECK: $ebx = ADD32rr $ebx, $ebp, implicit-def $eflags + ; CHECK: $ebx = LEA32r killed $ebp, 1, $ebp, 0, $noreg + ; CHECK: RETQ $ebx $ebx = LEA32r killed $ebp, 1, $ebp, 0, $noreg RETQ $ebx @@ -366,10 +392,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$ebx' } - { reg: '$ebp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -386,9 +412,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp, $ebx + + ; CHECK-LABEL: name: testleaadd_ebp_index_32 + ; CHECK: liveins: $eax, $ebp, $ebx ; CHECK: $ebx = LEA32r $noreg, 1, $ebp, 5, $noreg - ; CHECK: $ebx = ADD32rr $ebx, $ebp - + ; CHECK: $ebx = ADD32rr $ebx, $ebp, implicit-def $eflags + ; CHECK: RETQ $ebx $ebx = LEA32r $ebp, 1, $ebp, 5, $noreg RETQ $ebx @@ -401,10 +430,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$ebx' } - { reg: '$ebp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -421,9 +450,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, 
$ebp, $ebx + + ; CHECK-LABEL: name: testleaadd_ebp_index2_32 + ; CHECK: liveins: $eax, $ebp, $ebx ; CHECK: $ebx = LEA32r $noreg, 4, $ebp, 5, $noreg - ; CHECK: $ebx = ADD32rr $ebx, $ebp - + ; CHECK: $ebx = ADD32rr $ebx, $ebp, implicit-def $eflags + ; CHECK: RETQ $ebx $ebx = LEA32r $ebp, 4, $ebp, 5, $noreg RETQ $ebx @@ -436,10 +468,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$ebx' } - { reg: '$ebp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -456,8 +488,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp, $ebx + + ; CHECK-LABEL: name: test_skip_opt_32 + ; CHECK: liveins: $eax, $ebp, $ebx ; CHECK: $ebp = LEA32r killed $ebp, 4, killed $ebp, 0, $noreg - + ; CHECK: RETQ $ebp $ebp = LEA32r killed $ebp, 4, killed $ebp, 0, $noreg RETQ $ebp @@ -470,10 +505,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$ebp' } - { reg: '$eax' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -488,12 +523,22 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | + ; CHECK-LABEL: name: test_skip_eflags_32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $eax, $ebp, $ebx + ; CHECK: CMP32rr $eax, killed $ebx, implicit-def $eflags + ; CHECK: $ebx = LEA32r killed $eax, 4, killed $eax, 5, $noreg + ; CHECK: JCC_1 %bb.1, 4, implicit $eflags + ; CHECK: RETQ $ebx + ; CHECK: bb.1: + ; CHECK: liveins: $eax, $ebp, $ebx + ; CHECK: $ebp = LEA32r killed $ebx, 4, killed $ebx, 0, $noreg + ; CHECK: $ebp = ADD32ri8 $ebp, 5, implicit-def $eflags + ; CHECK: RETQ $ebp bb.0 (%ir-block.0): liveins: $eax, $ebp, $ebx - ; CHECK: $ebx = LEA32r killed $eax, 4, killed $eax, 5, $noreg - ; CHECK: $ebp = LEA32r killed $ebx, 4, killed $ebx, 0, $noreg - ; CHECK: $ebp = ADD32ri8 $ebp, 5 - + CMP32rr $eax, killed $ebx, implicit-def $eflags $ebx = LEA32r killed $eax, 4, killed $eax, 5, $noreg JCC_1 %bb.1, 4, implicit $eflags diff --git a/llvm/test/CodeGen/X86/leaFixup64.mir b/llvm/test/CodeGen/X86/leaFixup64.mir index 317c219992c742..77be582225694e 100644 --- a/llvm/test/CodeGen/X86/leaFixup64.mir +++ b/llvm/test/CodeGen/X86/leaFixup64.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass x86-fixup-LEAs -mtriple=x86_64-gnu-unknown -verify-machineinstrs -mcpu=corei7-avx -o - %s | FileCheck %s --- | ; ModuleID = 'lea-2.ll' @@ -78,13 +79,13 @@ define i32 @test8() { ret i32 0 } - + ;testleaaddi32_64_32: 3 operands LEA64_32r that can be replaced with 1 lea + 1 add instructions where ; ADD64ri32 is chosen define i32 @testleaaddi32_64_32() { ret i32 0 } - + ;test1mov1add_rbp_64_32: 2 operands LEA64_32r cannot be replaced with 1 add 1 mov instructions ; where the base is rbp/r13/ebp register define i32 @test1mov1add_rbp_64_32() { @@ -102,13 +103,13 @@ define i32 @testleaadd_rbp_index2_64_32() { ret i32 0 } - + ;test2addi32_64: 3 operands LEA64r that can be replaced with 2 add instructions where ADD64ri32 ; is chosen define i32 @test2addi32_64() { ret i32 0 } - + ;test1mov1add_rbp_64: 2 operands LEA64r that can be replaced with 1 add 1 mov instructions ; where the base is rbp/r13/ebp register define i32 @test1mov1add_rbp_64() { @@ -157,10 +158,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' 
} -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -177,9 +178,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp - ; CHECK: $eax = LEA64_32r killed $rax, 1, killed $rbp, 0 - ; CHECK: $eax = ADD32ri8 $eax, -5 - + + ; CHECK-LABEL: name: testleaadd_64_32_1 + ; CHECK: liveins: $rax, $rbp + ; CHECK: $eax = ADD32rr $eax, $ebp, implicit-def $eflags, implicit $rax, implicit $rbp + ; CHECK: $eax = ADD32ri8 $eax, -5, implicit-def $eflags + ; CHECK: RETQ $eax $eax = LEA64_32r killed $rax, 1, killed $rbp, -5, $noreg RETQ $eax @@ -192,10 +196,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -212,9 +216,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp - ; CHECK: $ebp = LEA64_32r killed $rax, 1, killed $rbp, 0 - ; CHECK: $ebp = ADD32ri8 $ebp, -5 - + + ; CHECK-LABEL: name: testleaadd_rbp_64_32_1 + ; CHECK: liveins: $rax, $rbp + ; CHECK: $ebp = ADD32rr $ebp, $eax, implicit-def $eflags, implicit $rbp, implicit $rax + ; CHECK: $ebp = ADD32ri8 $ebp, -5, implicit-def $eflags + ; CHECK: RETQ $ebp $ebp = LEA64_32r killed $rbp, 1, killed $rax, -5, $noreg RETQ $ebp @@ -227,10 +234,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -247,8 +254,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp + + ; CHECK-LABEL: name: test1lea_rbp_64_32_1 + ; CHECK: liveins: $rax, $rbp ; CHECK: $ebp = ADD32rr $ebp, $eax, implicit-def $eflags, implicit $rbp, implicit $rax - + ; CHECK: RETQ $ebp $ebp = LEA64_32r killed $rbp, 1, killed $rax, 0, $noreg RETQ $ebp @@ -261,10 +271,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -281,9 +291,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp - ; CHECK: $rax = ADD64rr $rax, killed $rbp - ; CHECK: $rax = ADD64ri8 $rax, -5 - + + ; CHECK-LABEL: name: test2add_64 + ; CHECK: liveins: $rax, $rbp + ; CHECK: $rax = ADD64rr $rax, $rbp, implicit-def $eflags + ; CHECK: $rax = ADD64ri8 $rax, -5, implicit-def $eflags + ; CHECK: RETQ $eax $rax = LEA64r killed $rax, 1, killed $rbp, -5, $noreg RETQ $eax @@ -296,10 +309,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -316,9 +329,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp - ; CHECK: $rbp = ADD64rr $rbp, killed $rax - ; CHECK: $rbp = ADD64ri8 $rbp, -5 - + + ; CHECK-LABEL: name: test2add_rbp_64 + ; CHECK: liveins: $rax, $rbp + ; CHECK: $rbp = ADD64rr $rbp, $rax, implicit-def $eflags + ; CHECK: $rbp = ADD64ri8 $rbp, -5, implicit-def $eflags + ; CHECK: RETQ $ebp $rbp = LEA64r killed $rbp, 1, killed $rax, -5, $noreg RETQ $ebp @@ -331,10 +347,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: 
false @@ -351,8 +367,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp - ; CHECK: $rbp = ADD64rr $rbp, $rax - + + ; CHECK-LABEL: name: test1add_rbp_64 + ; CHECK: liveins: $rax, $rbp + ; CHECK: $rbp = ADD64rr $rbp, $rax, implicit-def $eflags + ; CHECK: RETQ $ebp $rbp = LEA64r killed $rbp, 1, killed $rax, 0, $noreg RETQ $ebp @@ -365,11 +384,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } - { reg: '$rbx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -386,9 +405,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp + + ; CHECK-LABEL: name: testleaadd_64_32 + ; CHECK: liveins: $rax, $rbp ; CHECK: $ebx = LEA64_32r killed $rax, 1, killed $rbp, 0, $noreg - ; CHECK: $ebx = ADD32ri8 $ebx, -5 - + ; CHECK: $ebx = ADD32ri8 $ebx, -5, implicit-def $eflags + ; CHECK: RETQ $ebx $ebx = LEA64_32r killed $rax, 1, killed $rbp, -5, $noreg RETQ $ebx @@ -401,11 +423,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } - { reg: '$rbx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -422,9 +444,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp + + ; CHECK-LABEL: name: testleaadd_rbp_64_32 + ; CHECK: liveins: $rax, $rbp ; CHECK: $ebx = LEA64_32r killed $rax, 1, killed $rbp, 0, $noreg - ; CHECK: $ebx = ADD32ri8 $ebx, -5 - + ; CHECK: $ebx = ADD32ri8 $ebx, -5, implicit-def $eflags + ; CHECK: RETQ $ebx $ebx = LEA64_32r killed $rbp, 1, killed $rax, -5, $noreg RETQ $ebx @@ -437,11 +462,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } - { reg: '$rbx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -458,8 +483,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp + + ; CHECK-LABEL: name: test1lea_rbp_64_32 + ; CHECK: liveins: $rax, $rbp ; CHECK: $ebx = LEA64_32r killed $rax, 1, killed $rbp, 0, $noreg - + ; CHECK: RETQ $ebx $ebx = LEA64_32r killed $rbp, 1, killed $rax, 0, $noreg RETQ $ebx @@ -472,11 +500,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } - { reg: '$rbx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -493,9 +521,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp + + ; CHECK-LABEL: name: testleaadd_64 + ; CHECK: liveins: $rax, $rbp ; CHECK: $rbx = LEA64r killed $rax, 1, killed $rbp, 0, $noreg - ; CHECK: $rbx = ADD64ri8 $rbx, -5 - + ; CHECK: $rbx = ADD64ri8 $rbx, -5, implicit-def $eflags + ; CHECK: RETQ $ebx $rbx = LEA64r killed $rax, 1, killed $rbp, -5, $noreg RETQ $ebx @@ -508,11 +539,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } - { reg: '$rbx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -529,9 +560,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp + + ; CHECK-LABEL: name: testleaadd_rbp_64 + ; CHECK: liveins: $rax, $rbp ; CHECK: $rbx = LEA64r killed $rax, 1, killed $rbp, 0, $noreg - ; CHECK: $rbx = ADD64ri8 $rbx, -5 - + ; CHECK: $rbx = ADD64ri8 $rbx, -5, implicit-def $eflags + ; CHECK: RETQ $ebx 
$rbx = LEA64r killed $rbp, 1, killed $rax, -5, $noreg RETQ $ebx @@ -544,11 +578,11 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } - { reg: '$rbx' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -565,8 +599,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp + + ; CHECK-LABEL: name: test1lea_rbp_64 + ; CHECK: liveins: $rax, $rbp ; CHECK: $rbx = LEA64r killed $rax, 1, killed $rbp, 0, $noreg - + ; CHECK: RETQ $ebx $rbx = LEA64r killed $rbp, 1, killed $rax, 0, $noreg RETQ $ebx @@ -579,10 +616,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rdi' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -599,8 +636,13 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rdi, $rbp - ; CHECK: $r12 = LEA64r $noreg, 2, killed $r13, 5, $noreg - ; CHECK: $r12 = ADD64rr $r12, killed $rbp + ; CHECK-LABEL: name: test8 + ; CHECK: liveins: $rdi, $rbp + ; CHECK: $rbp = KILL $rbp, implicit-def $rbp + ; CHECK: $r13 = KILL $rdi, implicit-def $r13 + ; CHECK: $r12 = LEA64r $noreg, 2, killed $r13, 5, $noreg + ; CHECK: $r12 = ADD64rr $r12, killed $rbp, implicit-def $eflags + ; CHECK: RETQ $r12 $rbp = KILL $rbp, implicit-def $rbp $r13 = KILL $rdi, implicit-def $r13 $r12 = LEA64r killed $rbp, 2, killed $r13, 5, $noreg @@ -615,10 +657,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -635,9 +677,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp - ; CHECK: $eax = LEA64_32r killed $rax, 1, killed $rbp, 0 - ; CHECK: $eax = ADD32ri $eax, 129 - + + ; CHECK-LABEL: name: testleaaddi32_64_32 + ; CHECK: liveins: $rax, $rbp + ; CHECK: $eax = ADD32rr $eax, $ebp, implicit-def $eflags, implicit $rax, implicit $rbp + ; CHECK: $eax = ADD32ri $eax, 129, implicit-def $eflags + ; CHECK: RETQ $eax $eax = LEA64_32r killed $rax, 1, killed $rbp, 129, $noreg RETQ $eax @@ -650,10 +695,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -670,8 +715,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp, $rbx - ; CHECK: $ebx = LEA64_32r killed $rbp, 1, killed $rbp, 0, $noreg + ; CHECK-LABEL: name: test1mov1add_rbp_64_32 + ; CHECK: liveins: $rax, $rbp, $rbx + ; CHECK: $ebx = LEA64_32r killed $rbp, 1, killed $rbp, 0, $noreg + ; CHECK: RETQ $ebx $ebx = LEA64_32r killed $rbp, 1, killed $rbp, 0, $noreg RETQ $ebx @@ -684,10 +732,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rbx' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -704,8 +752,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp, $rbx + + ; CHECK-LABEL: name: testleaadd_rbp_index_64_32 + ; CHECK: liveins: $rax, $rbp, $rbx ; CHECK: $ebx = LEA64_32r killed $rbp, 1, killed $rbp, 5, $noreg - + ; CHECK: RETQ $ebx $ebx = LEA64_32r killed $rbp, 1, killed $rbp, 5, $noreg RETQ $ebx @@ -718,10 +769,10 @@ legalized: false regBankSelected: false selected: 
false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rbx' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -738,8 +789,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $eax, $ebp, $ebx + + ; CHECK-LABEL: name: testleaadd_rbp_index2_64_32 + ; CHECK: liveins: $eax, $ebp, $ebx ; CHECK: $ebx = LEA64_32r killed $rbp, 4, killed $rbp, 5, $noreg - + ; CHECK: RETQ $ebx $ebx = LEA64_32r killed $rbp, 4, killed $rbp, 5, $noreg RETQ $ebx @@ -752,10 +806,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -772,9 +826,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp - ; CHECK: $rax = ADD64rr $rax, killed $rbp - ; CHECK: $rax = ADD64ri32 $rax, 129 - + + ; CHECK-LABEL: name: test2addi32_64 + ; CHECK: liveins: $rax, $rbp + ; CHECK: $rax = ADD64rr $rax, $rbp, implicit-def $eflags + ; CHECK: $rax = ADD64ri32 $rax, 129, implicit-def $eflags + ; CHECK: RETQ $eax $rax = LEA64r killed $rax, 1, killed $rbp, 129, $noreg RETQ $eax @@ -787,10 +844,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rax' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -807,9 +864,13 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp, $rbx + + ; CHECK-LABEL: name: test1mov1add_rbp_64 + ; CHECK: liveins: $rax, $rbp, $rbx ; CHECK: $rbx = MOV64rr $rbp - ; CHECK: $rbx = ADD64rr $rbx, $rbp - + ; CHECK: $rbx = ADD64rr $rbx, $rbp, implicit-def $eflags + ; CHECK: $rbx = LEA64r killed $rbp, 1, $rbp, 0, $noreg + ; CHECK: RETQ $ebx $rbx = LEA64r killed $rbp, 1, $rbp, 0, $noreg RETQ $ebx @@ -822,10 +883,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rbx' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -842,9 +903,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp, $rbx - ; CHECK: $rbx = LEA64r $noreg, 1, $rbp, 5, $noreg - ; CHECK: $rbx = ADD64rr $rbx, $rbp - + + ; CHECK-LABEL: name: testleaadd_rbp_index_64 + ; CHECK: liveins: $rax, $rbp, $rbx + ; CHECK: $rbx = LEA64r $noreg, 1, $rbp, 5, $noreg + ; CHECK: $rbx = ADD64rr $rbx, $rbp, implicit-def $eflags + ; CHECK: RETQ $ebx $rbx = LEA64r $rbp, 1, $rbp, 5, $noreg RETQ $ebx @@ -857,10 +921,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rbx' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -877,9 +941,12 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp, $rbx + + ; CHECK-LABEL: name: testleaadd_rbp_index2_64 + ; CHECK: liveins: $rax, $rbp, $rbx ; CHECK: $rbx = LEA64r $noreg, 4, $rbp, 5, $noreg - ; CHECK: $rbx = ADD64rr $rbx, $rbp - + ; CHECK: $rbx = ADD64rr $rbx, $rbp, implicit-def $eflags + ; CHECK: RETQ $ebx $rbx = LEA64r $rbp, 4, $rbp, 5, $noreg RETQ $ebx @@ -892,10 +959,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rbx' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -912,8 +979,11 @@ frameInfo: body: | bb.0 
(%ir-block.0): liveins: $rax, $rbp, $rbx + + ; CHECK-LABEL: name: test_skip_opt_64 + ; CHECK: liveins: $rax, $rbp, $rbx ; CHECK: $rbp = LEA64r killed $rbp, 4, killed $rbp, 0, $noreg - + ; CHECK: RETQ $ebp $rbp = LEA64r killed $rbp, 4, killed $rbp, 0, $noreg RETQ $ebp @@ -926,10 +996,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rbp' } - { reg: '$rax' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -944,12 +1014,22 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | + ; CHECK-LABEL: name: test_skip_eflags_64 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $rax, $rbp, $rbx + ; CHECK: CMP64rr $rax, killed $rbx, implicit-def $eflags + ; CHECK: $rbx = LEA64r killed $rax, 4, killed $rax, 5, $noreg + ; CHECK: JCC_1 %bb.1, 4, implicit $eflags + ; CHECK: RETQ $ebx + ; CHECK: bb.1: + ; CHECK: liveins: $rax, $rbp, $rbx + ; CHECK: $rbp = LEA64r killed $rbx, 4, killed $rbx, 0, $noreg + ; CHECK: $rbp = ADD64ri8 $rbp, 5, implicit-def $eflags + ; CHECK: RETQ $ebp bb.0 (%ir-block.0): liveins: $rax, $rbp, $rbx - ; CHECK: $rbx = LEA64r killed $rax, 4, killed $rax, 5, $noreg - ; CHECK: $rbp = LEA64r killed $rbx, 4, killed $rbx, 0, $noreg - ; CHECK: $rbp = ADD64ri8 $rbp, 5 - + CMP64rr $rax, killed $rbx, implicit-def $eflags $rbx = LEA64r killed $rax, 4, killed $rax, 5, $noreg JCC_1 %bb.1, 4, implicit $eflags @@ -968,10 +1048,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rbx' } - { reg: '$rbp' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -988,8 +1068,11 @@ frameInfo: body: | bb.0 (%ir-block.0): liveins: $rax, $rbp, $rbx + + ; CHECK-LABEL: name: test_skip_opt_64_32 + ; CHECK: liveins: $rax, $rbp, $rbx ; CHECK: $ebp = LEA64_32r killed $rbp, 4, killed $rbp, 0, $noreg - + ; CHECK: RETQ $ebp $ebp = LEA64_32r killed $rbp, 4, killed $rbp, 0, $noreg RETQ $ebp @@ -1002,10 +1085,10 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true -liveins: +liveins: - { reg: '$rbp' } - { reg: '$rax' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -1020,12 +1103,22 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | + ; CHECK-LABEL: name: test_skip_eflags_64_32 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $rax, $rbp, $rbx + ; CHECK: CMP64rr $rax, killed $rbx, implicit-def $eflags + ; CHECK: $ebx = LEA64_32r killed $rax, 4, killed $rax, 5, $noreg + ; CHECK: JCC_1 %bb.1, 4, implicit $eflags + ; CHECK: RETQ $ebx + ; CHECK: bb.1: + ; CHECK: liveins: $rax, $rbp, $rbx + ; CHECK: $ebp = LEA64_32r killed $rbx, 4, killed $rbx, 0, $noreg + ; CHECK: $ebp = ADD32ri8 $ebp, 5, implicit-def $eflags + ; CHECK: RETQ $ebp bb.0 (%ir-block.0): liveins: $rax, $rbp, $rbx - ; CHECK: $ebx = LEA64_32r killed $rax, 4, killed $rax, 5, $noreg - ; CHECK: $ebp = LEA64_32r killed $rbx, 4, killed $rbx, 0, $noreg - ; CHECK: $ebp = ADD32ri8 $ebp, 5 - + CMP64rr $rax, killed $rbx, implicit-def $eflags $ebx = LEA64_32r killed $rax, 4, killed $rax, 5, $noreg JCC_1 %bb.1, 4, implicit $eflags diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 8c7232cf950e38..164826d8715592 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ 
-4913,24 +4913,30 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; AVX512F-LABEL: widen_masked_store: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: kmovw %edx, %k0 ; AVX512F-NEXT: andl $1, %esi -; AVX512F-NEXT: kmovw %esi, %k1 -; AVX512F-NEXT: kxorw %k0, %k0, %k2 -; AVX512F-NEXT: kshiftrw $1, %k2, %k2 -; AVX512F-NEXT: kshiftlw $1, %k2, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k2 -; AVX512F-NEXT: kxorw %k0, %k2, %k0 +; AVX512F-NEXT: kmovw %esi, %k0 +; AVX512F-NEXT: kxorw %k0, %k0, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kshiftlw $2, %k1, %k1 ; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-NEXT: kshiftlw $3, %k1, %k1 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kxorw %k0, %k1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: kxorw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 -; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: korw %k0, %k1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} @@ -4939,48 +4945,60 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; ; AVX512VLDQ-LABEL: widen_masked_store: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: kmovw %edx, %k0 -; AVX512VLDQ-NEXT: kmovw %esi, %k1 -; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k2 -; AVX512VLDQ-NEXT: kshiftrb $1, %k2, %k2 -; AVX512VLDQ-NEXT: kshiftlb $1, %k2, %k2 -; AVX512VLDQ-NEXT: korb %k1, %k2, %k1 -; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k2 -; AVX512VLDQ-NEXT: kxorb %k0, %k2, %k0 +; AVX512VLDQ-NEXT: kmovw %esi, %k0 ; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 -; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 -; AVX512VLDQ-NEXT: kxorb %k0, %k1, %k0 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k1 +; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $1, %k1, %k1 +; AVX512VLDQ-NEXT: korb %k0, %k1, %k0 ; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %ecx, %k2 -; AVX512VLDQ-NEXT: kxorb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %edx, %k2 +; AVX512VLDQ-NEXT: kshiftlb $7, %k2, %k2 +; AVX512VLDQ-NEXT: kshiftrb $6, %k2, %k2 +; AVX512VLDQ-NEXT: korb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: korb %k1, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k1 +; AVX512VLDQ-NEXT: kshiftlb $3, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 +; AVX512VLDQ-NEXT: korw %k1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %ecx, %k1 ; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1 -; AVX512VLDQ-NEXT: kxorw %k1, %k0, %k1 +; AVX512VLDQ-NEXT: korw %k0, %k1, %k1 ; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; 
AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: widen_masked_store: ; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: kmovd %edx, %k0 ; AVX512VLBW-NEXT: andl $1, %esi -; AVX512VLBW-NEXT: kmovw %esi, %k1 -; AVX512VLBW-NEXT: kxorw %k0, %k0, %k2 -; AVX512VLBW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512VLBW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512VLBW-NEXT: korw %k1, %k2, %k1 -; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k2 -; AVX512VLBW-NEXT: kxorw %k0, %k2, %k0 +; AVX512VLBW-NEXT: kmovw %esi, %k0 +; AVX512VLBW-NEXT: kxorw %k0, %k0, %k1 +; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512VLBW-NEXT: korw %k0, %k1, %k0 +; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLBW-NEXT: kshiftlw $2, %k1, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLBW-NEXT: kmovd %edx, %k2 +; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512VLBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512VLBW-NEXT: korw %k2, %k1, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $3, %k0, %k1 +; AVX512VLBW-NEXT: kshiftlw $3, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512VLBW-NEXT: kxorw %k0, %k1, %k0 -; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 -; AVX512VLBW-NEXT: kmovd %ecx, %k2 -; AVX512VLBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; AVX512VLBW-NEXT: kmovd %ecx, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512VLBW-NEXT: kxorw %k1, %k0, %k1 +; AVX512VLBW-NEXT: korw %k0, %k1, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: retq call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index 777d4d14e4e40b..83bf33c4f7f1e7 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -1432,9 +1432,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: jne .LBB2_1 @@ -1719,10 +1717,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper @@ -1744,10 +1740,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[2147483647,2147483647,2147483647,2147483647] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -2027,10 +2021,8 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -2071,10 +2063,8 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -2361,10 +2351,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -2405,10 +2393,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper diff --git 
a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 254f0cda48fc00..00fdbf26e70dbc 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -1215,8 +1215,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: jne .LBB2_1 @@ -1465,8 +1464,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper @@ -1487,8 +1485,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -1734,8 +1731,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -1776,8 +1772,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -2028,8 +2023,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -2070,8 +2064,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: 
kshiftrq $60, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 8a7c7847fe0c39..0077df867db4d5 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -5,6 +5,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512BW ; This tests codegen time inlining/optimization of memcmp ; rdar://6480398 @@ -1540,6 +1542,24 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq +; +; X64-AVX512F-LABEL: length64_eq: +; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 +; X64-AVX512F-NEXT: kortestw %k0, %k0 +; X64-AVX512F-NEXT: setae %al +; X64-AVX512F-NEXT: vzeroupper +; X64-AVX512F-NEXT: retq +; +; X64-AVX512BW-LABEL: length64_eq: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k0 +; X64-AVX512BW-NEXT: kortestq %k0, %k0 +; X64-AVX512BW-NEXT: setae %al +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind %cmp = icmp ne i32 %call, 0 ret i1 %cmp @@ -1592,6 +1612,24 @@ define i1 @length64_eq_const(i8* %X) nounwind { ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq +; +; X64-AVX512F-LABEL: length64_eq_const: +; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512F-NEXT: vpcmpeqd {{.*}}(%rip), %zmm0, %k0 +; X64-AVX512F-NEXT: kortestw %k0, %k0 +; X64-AVX512F-NEXT: setb %al +; X64-AVX512F-NEXT: vzeroupper +; X64-AVX512F-NEXT: retq +; +; X64-AVX512BW-LABEL: length64_eq_const: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; X64-AVX512BW-NEXT: vpcmpeqb {{.*}}(%rip), %zmm0, %k0 +; X64-AVX512BW-NEXT: kortestq %k0, %k0 +; X64-AVX512BW-NEXT: setb %al +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind %c = icmp eq i32 %m, 0 ret i1 %c diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index b69525deb41ef7..e3c66e83c83f6f 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -797,14 +797,12 @@ define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %x) nounwind "min-legal-vector- ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2 ; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3 -; CHECK-NEXT: vpmovqd %ymm2, %xmm2 -; CHECK-NEXT: vpmovqd %ymm3, %xmm3 -; CHECK-NEXT: vinserti128 $1, 
%xmm3, %ymm2, %ymm2 -; CHECK-NEXT: vpmovdb %ymm2, %xmm2 -; CHECK-NEXT: vpmovqd %ymm0, %xmm0 -; CHECK-NEXT: vpmovqd %ymm1, %xmm1 -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vpmovdb %ymm0, %xmm0 +; CHECK-NEXT: vpmovqb %ymm3, %xmm3 +; CHECK-NEXT: vpmovqb %ymm2, %xmm2 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: vpmovqb %ymm1, %xmm1 +; CHECK-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -828,6 +826,21 @@ define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector- ret <16 x i8> %b } +define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { +; CHECK-LABEL: trunc_v8i64_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovqb %ymm1, %xmm1 +; CHECK-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = load <8 x i64>, <8 x i64>* %x + %b = trunc <8 x i64> %a to <8 x i8> + ret <8 x i8> %b +} + define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { ; CHECK-LABEL: trunc_v8i64_v8i16: ; CHECK: # %bb.0: @@ -982,3 +995,87 @@ define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal store <16 x i64> %a, <16 x i64>* %y ret void } + +define void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v8i16_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: kshiftrb $4, %k1, %k2 +; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <8 x i64>, <8 x i64>* %p + %y = load <8 x i64>, <8 x i64>* %q + %a = icmp eq <8 x i16> %s, %t + %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y + store <8 x i64> %b, <8 x i64>* %r + ret void +} + +define void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v8i32_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; CHECK-NEXT: kshiftrb $4, %k1, %k2 +; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <8 x i64>, <8 x i64>* %p + %y = load <8 x i64>, <8 x i64>* %q + %a = icmp eq <8 x i32> %s, %t + %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y + store <8 x i64> %b, <8 x i64>* %r + ret void +} + +define void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v16i8_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: kshiftrw $8, %k1, %k2 +; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa32 
(%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <16 x i32>, <16 x i32>* %p + %y = load <16 x i32>, <16 x i32>* %q + %a = icmp eq <16 x i8> %s, %t + %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y + store <16 x i32> %b, <16 x i32>* %r + ret void +} + +define void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v16i16_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; CHECK-NEXT: kshiftrw $8, %k1, %k2 +; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <16 x i32>, <16 x i32>* %p + %y = load <16 x i32>, <16 x i32>* %q + %a = icmp eq <16 x i16> %s, %t + %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y + store <16 x i32> %b, <16 x i32>* %r + ret void +} diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll index c0fa42e3c2bad4..e3bd9d9e6ed126 100644 --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -356,18 +356,18 @@ define <32 x i8> @packsswb_icmp_zero_trunc_256(<16 x i16> %a0) { ; ; AVX1-LABEL: packsswb_icmp_zero_trunc_256: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = zero,zero,ymm0[0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: ret{{[l|q]}} ; diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll index 056682bb2750b2..f8ead6352f1d35 100644 --- a/llvm/test/CodeGen/X86/pr34177.ll +++ b/llvm/test/CodeGen/X86/pr34177.ll @@ -6,45 +6,88 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define void @test(<4 x i64> %a, <4 x x86_fp80> %b, <8 x x86_fp80>* %c) local_unnamed_addr { -; CHECK-LABEL: test: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovq %xmm0, %rax -; CHECK-NEXT: vpextrq $1, %xmm0, %rcx -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovq %xmm0, %rdx -; CHECK-NEXT: vpextrq $1, %xmm0, %rsi -; CHECK-NEXT: cmpq $3, %rsi -; CHECK-NEXT: fld1 -; CHECK-NEXT: fldz -; CHECK-NEXT: fld %st(0) -; CHECK-NEXT: fcmove %st(2), %st -; CHECK-NEXT: cmpq $2, %rdx -; CHECK-NEXT: fld %st(1) -; CHECK-NEXT: fcmove %st(3), %st -; 
CHECK-NEXT: cmpq $1, %rcx -; CHECK-NEXT: fld %st(2) -; CHECK-NEXT: fcmove %st(4), %st -; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: fxch %st(3) -; CHECK-NEXT: fcmove %st(4), %st -; CHECK-NEXT: fstp %st(4) -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt 70(%rdi) -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt 50(%rdi) -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt 30(%rdi) -; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) -; CHECK-NEXT: fstpt 10(%rdi) -; CHECK-NEXT: fxch %st(1) -; CHECK-NEXT: fadd %st, %st(0) -; CHECK-NEXT: fstpt 60(%rdi) -; CHECK-NEXT: fadd %st, %st(0) -; CHECK-NEXT: fstpt 40(%rdi) -; CHECK-NEXT: fadd %st, %st(0) -; CHECK-NEXT: fstpt 20(%rdi) -; CHECK-NEXT: fadd %st, %st(0) -; CHECK-NEXT: fstpt (%rdi) +; AVX512F-LABEL: test: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: cmpq $3, %rsi +; AVX512F-NEXT: fld1 +; AVX512F-NEXT: fldz +; AVX512F-NEXT: fld %st(0) +; AVX512F-NEXT: fcmove %st(2), %st +; AVX512F-NEXT: cmpq $2, %rdx +; AVX512F-NEXT: fld %st(1) +; AVX512F-NEXT: fcmove %st(3), %st +; AVX512F-NEXT: cmpq $1, %rcx +; AVX512F-NEXT: fld %st(2) +; AVX512F-NEXT: fcmove %st(4), %st +; AVX512F-NEXT: testq %rax, %rax +; AVX512F-NEXT: fxch %st(3) +; AVX512F-NEXT: fcmove %st(4), %st +; AVX512F-NEXT: fstp %st(4) +; AVX512F-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512F-NEXT: fstpt 70(%rdi) +; AVX512F-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512F-NEXT: fstpt 50(%rdi) +; AVX512F-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512F-NEXT: fstpt 30(%rdi) +; AVX512F-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512F-NEXT: fstpt 10(%rdi) +; AVX512F-NEXT: fxch %st(1) +; AVX512F-NEXT: fadd %st, %st(0) +; AVX512F-NEXT: fstpt 60(%rdi) +; AVX512F-NEXT: fadd %st, %st(0) +; AVX512F-NEXT: fstpt 40(%rdi) +; AVX512F-NEXT: fadd %st, %st(0) +; AVX512F-NEXT: fstpt 20(%rdi) +; AVX512F-NEXT: fadd %st, %st(0) +; AVX512F-NEXT: fstpt (%rdi) +; +; AVX512VL-LABEL: test: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqq {{.*}}(%rip), %ymm0, %k0 +; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 +; AVX512VL-NEXT: kshiftrb $1, %k0, %k2 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: fld1 +; AVX512VL-NEXT: fldz +; AVX512VL-NEXT: fld %st(0) +; AVX512VL-NEXT: fcmovne %st(2), %st +; AVX512VL-NEXT: kshiftrb $1, %k1, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: fld %st(1) +; AVX512VL-NEXT: fcmovne %st(3), %st +; AVX512VL-NEXT: kmovd %k1, %eax +; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: fld %st(2) +; AVX512VL-NEXT: fcmovne %st(4), %st +; AVX512VL-NEXT: kmovd %k2, %eax +; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: fxch %st(3) +; AVX512VL-NEXT: fcmovne %st(4), %st +; AVX512VL-NEXT: fstp %st(4) +; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512VL-NEXT: fstpt 70(%rdi) +; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512VL-NEXT: fstpt 50(%rdi) +; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512VL-NEXT: fstpt 30(%rdi) +; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX512VL-NEXT: fstpt 10(%rdi) +; AVX512VL-NEXT: fxch %st(1) +; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt (%rdi) +; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt 60(%rdi) +; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt 40(%rdi) +; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt 20(%rdi) %1 = icmp eq <4 x i64> , %a %2 = select <4 x i1> %1, <4 x x86_fp80> , <4 x x86_fp80> zeroinitializer %3 = 
fadd <4 x x86_fp80> %2, %2 diff --git a/llvm/test/CodeGen/X86/pr43575.ll b/llvm/test/CodeGen/X86/pr43575.ll new file mode 100644 index 00000000000000..00c70c69709725 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr43575.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.14.0 -O0 | FileCheck %s + +define void @exit(i32 %status) +; CHECK-LABEL: exit: +; CHECK: ## %bb.0: +; CHECK: ## InlineAsm Start +; CHECK: movq $60, %rax +; CHECK: syscall +; CHECK: ## InlineAsm End +; CHECK: retq +{ + call void asm sideeffect inteldialect "mov rax, 60; syscall", ""() + ret void +} diff --git a/llvm/test/CodeGen/X86/ptr-rotate.ll b/llvm/test/CodeGen/X86/ptr-rotate.ll index fbd13b5036447f..c2365c607d069f 100644 --- a/llvm/test/CodeGen/X86/ptr-rotate.ll +++ b/llvm/test/CodeGen/X86/ptr-rotate.ll @@ -1,11 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i386-apple-darwin -mcpu=corei7 -o - < %s | FileCheck %s define i32 @func(i8* %A) nounwind readnone { +; CHECK-LABEL: func: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: roll $27, %eax +; CHECK-NEXT: retl entry: %tmp = ptrtoint i8* %A to i32 %shr = lshr i32 %tmp, 5 %shl = shl i32 %tmp, 27 %or = or i32 %shr, %shl -; CHECK: roll $27 ret i32 %or } diff --git a/llvm/test/CodeGen/X86/select-1-or-neg1.ll b/llvm/test/CodeGen/X86/select-1-or-neg1.ll index b0244fe7d992c9..c85cc08f886b35 100644 --- a/llvm/test/CodeGen/X86/select-1-or-neg1.ll +++ b/llvm/test/CodeGen/X86/select-1-or-neg1.ll @@ -19,8 +19,8 @@ define i32 @PR28968(i32 %x) { ; SLOWLEA3-NEXT: xorl %eax, %eax ; SLOWLEA3-NEXT: cmpl $1, %edi ; SLOWLEA3-NEXT: sete %al -; SLOWLEA3-NEXT: leal (%rax,%rax), %eax -; SLOWLEA3-NEXT: addl $-1, %eax +; SLOWLEA3-NEXT: addl %eax, %eax +; SLOWLEA3-NEXT: decl %eax ; SLOWLEA3-NEXT: retq %cmp = icmp eq i32 %x, 1 %sel = select i1 %cmp, i32 1, i32 -1 diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index d8176e488c1c2a..58baea95fcd0b2 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -319,14 +319,23 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: ne_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setae %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_i512: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setae %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 %bcy = bitcast <8 x i64> %y to i512 %cmp = icmp ne i512 %bcx, %bcy @@ -464,14 +473,23 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: eq_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: xorl %eax, %eax +; 
AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: eq_i512: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %bcx = bitcast <8 x i64> %x to i512 %bcy = bitcast <8 x i64> %y to i512 %cmp = icmp eq i512 %bcx, %bcy @@ -804,17 +822,29 @@ define i32 @ne_i512_pair(i512* %a, i512* %b) { ; NO512-NEXT: setne %al ; NO512-NEXT: retq ; -; AVX512-LABEL: ne_i512_pair: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 -; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 -; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: ne_i512_pair: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setae %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_i512_pair: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k1 +; AVX512BW-NEXT: vpcmpeqb 64(%rsi), %zmm1, %k0 {%k1} +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setae %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0 @@ -886,17 +916,29 @@ define i32 @eq_i512_pair(i512* %a, i512* %b) { ; NO512-NEXT: sete %al ; NO512-NEXT: retq ; -; AVX512-LABEL: eq_i512_pair: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 -; AVX512-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 -; AVX512-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: eq_i512_pair: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512F-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 +; AVX512F-NEXT: vpcmpeqd 64(%rsi), %zmm1, %k0 {%k1} +; AVX512F-NEXT: xorl %eax, %eax +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: eq_i512_pair: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vpcmpeqb (%rsi), %zmm0, %k1 +; AVX512BW-NEXT: vpcmpeqb 64(%rsi), %zmm1, %k0 {%k1} +; AVX512BW-NEXT: xorl %eax, %eax +; AVX512BW-NEXT: kortestq %k0, %k0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %a0 = load i512, i512* %a %b0 = load i512, i512* %b %xor1 = xor i512 %a0, %b0 diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index ebb3b623c46775..720cabee9122c8 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -549,20 +549,13 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VL: # %bb.0: -; 
AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1 +; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: @@ -585,20 +578,13 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: diff --git a/llvm/test/CodeGen/X86/srem-lkk.ll b/llvm/test/CodeGen/X86/srem-lkk.ll new file mode 100644 index 00000000000000..ae30ae4463a93f --- /dev/null +++ b/llvm/test/CodeGen/X86/srem-lkk.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK + +define i32 @fold_srem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_srem_positive_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-1401515643, %rax, %rcx # imm = 0xAC769185 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: sarl $6, %ecx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $95, %ecx, %ecx +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_srem_positive_even(i32 %x) { +; CHECK-LABEL: fold_srem_positive_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; 
CHECK-NEXT: imulq $1037275121, %rax, %rcx # imm = 0x3DD38FF1 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: shrq $63, %rdx +; CHECK-NEXT: sarq $40, %rcx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $1060, %ecx, %ecx # imm = 0x424 +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 1060 + ret i32 %1 +} + + +define i32 @fold_srem_negative_odd(i32 %x) { +; CHECK-LABEL: fold_srem_negative_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-1520762971, %rax, %rcx # imm = 0xA55AFFA5 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: shrq $63, %rdx +; CHECK-NEXT: sarq $40, %rcx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $-723, %ecx, %ecx # imm = 0xFD2D +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, -723 + ret i32 %1 +} + + +define i32 @fold_srem_negative_even(i32 %x) { +; CHECK-LABEL: fold_srem_negative_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-47844377, %rax, %rcx # imm = 0xFD25F3E7 +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: shrq $63, %rdx +; CHECK-NEXT: sarq $40, %rcx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $-22981, %ecx, %ecx # imm = 0xA63B +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, -22981 + ret i32 %1 +} + + +; Don't fold if we can combine srem with sdiv. +define i32 @combine_srem_sdiv(i32 %x) { +; CHECK-LABEL: combine_srem_sdiv: +; CHECK: # %bb.0: +; CHECK-NEXT: movslq %edi, %rax +; CHECK-NEXT: imulq $-1401515643, %rax, %rcx # imm = 0xAC769185 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl %ecx, %edx +; CHECK-NEXT: shrl $31, %edx +; CHECK-NEXT: sarl $6, %ecx +; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: imull $95, %ecx, %edx +; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 95 + %2 = sdiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_srem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_srem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: leal 63(%rax), %ecx +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnsl %edi, %ecx +; CHECK-NEXT: andl $-64, %ecx +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq + %1 = srem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_srem_one(i32 %x) { +; CHECK-LABEL: dont_fold_srem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq + %1 = srem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^31. 
+define i32 @dont_fold_srem_i32_smax(i32 %x) { +; CHECK-LABEL: dont_fold_srem_i32_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: leal 2147483647(%rdi), %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: cmovnsl %edi, %eax +; CHECK-NEXT: andl $-2147483648, %eax # imm = 0x80000000 +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq + %1 = srem i32 %x, 2147483648 + ret i32 %1 +} + +; Don't fold i64 srem +define i64 @dont_fold_srem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_srem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: imulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: shrq $63, %rax +; CHECK-NEXT: sarq $5, %rdx +; CHECK-NEXT: addq %rax, %rdx +; CHECK-NEXT: imulq $98, %rdx, %rax +; CHECK-NEXT: subq %rax, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq + %1 = srem i64 %x, 98 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll new file mode 100644 index 00000000000000..19f79327476871 --- /dev/null +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -0,0 +1,556 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 + +define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { +; SSE-LABEL: fold_srem_vec_1: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: subl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $9, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 +; SSE-NEXT: shrl $16, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: movzwl %dx, %edx +; SSE-NEXT: movswl %dx, %esi +; SSE-NEXT: shrl $15, %edx +; SSE-NEXT: sarl $6, %esi +; SSE-NEXT: addl %edx, %esi +; SSE-NEXT: imull $95, %esi, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pextrw $1, %xmm0, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF +; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: shrl $31, %esi +; SSE-NEXT: sarl $21, %edx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: imull $-124, %edx, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 +; SSE-NEXT: movl %edx, %esi +; SSE-NEXT: shrl $31, %esi +; SSE-NEXT: sarl $18, %edx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: imull $98, %edx, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fold_srem_vec_1: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: subl %eax, 
%ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $9, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 +; AVX-NEXT: shrl $16, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: movswl %dx, %esi +; AVX-NEXT: shrl $15, %edx +; AVX-NEXT: sarl $6, %esi +; AVX-NEXT: addl %edx, %esi +; AVX-NEXT: imull $95, %esi, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpextrw $1, %xmm0, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: shrl $31, %esi +; AVX-NEXT: sarl $21, %edx +; AVX-NEXT: addl %esi, %edx +; AVX-NEXT: imull $-124, %edx, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 +; AVX-NEXT: movl %edx, %esi +; AVX-NEXT: shrl $31, %esi +; AVX-NEXT: sarl $18, %edx +; AVX-NEXT: addl %esi, %edx +; AVX-NEXT: imull $98, %edx, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 +; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { +; SSE-LABEL: fold_srem_vec_2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: psraw $6, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fold_srem_vec_2: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine srem with sdiv. +define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { +; SSE-LABEL: combine_srem_sdiv: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhw %xmm0, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: psraw $6, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmullw %xmm1, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_srem_sdiv: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + %2 = sdiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { +; SSE-LABEL: dont_fold_srem_power_of_two: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: leal 31(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-32, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: leal 63(%rcx), %edx +; SSE-NEXT: testw %cx, %cx +; SSE-NEXT: cmovnsl %ecx, %edx +; SSE-NEXT: andl $-64, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: leal 7(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-8, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $6, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: imull $95, %edx, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_srem_power_of_two: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: leal 31(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-32, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: leal 63(%rcx), %edx +; AVX-NEXT: testw %cx, %cx +; AVX-NEXT: cmovnsl %ecx, %edx +; AVX-NEXT: andl $-64, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: leal 7(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-8, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $6, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: imull $95, %edx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. 
+define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { +; SSE-LABEL: dont_fold_srem_one: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $4, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: leal (%rdx,%rdx,2), %ecx +; SSE-NEXT: shll $3, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B +; SSE-NEXT: movl %ecx, %esi +; SSE-NEXT: shrl $31, %esi +; SSE-NEXT: sarl $23, %ecx +; SSE-NEXT: addl %esi, %ecx +; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 +; SSE-NEXT: movl %ecx, %edx +; SSE-NEXT: shrl $31, %edx +; SSE-NEXT: sarl $26, %ecx +; SSE-NEXT: addl %edx, %ecx +; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_srem_one: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $4, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: leal (%rdx,%rdx,2), %ecx +; AVX-NEXT: shll $3, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: addl %eax, %edx +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B +; AVX-NEXT: movl %ecx, %esi +; AVX-NEXT: shrl $31, %esi +; AVX-NEXT: sarl $23, %ecx +; AVX-NEXT: addl %esi, %ecx +; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: shrl $31, %edx +; AVX-NEXT: sarl $26, %ecx +; AVX-NEXT: addl %edx, %ecx +; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is 2^15. 
+define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_i16_smax: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: movswl %cx, %edx +; SSE-NEXT: shrl $15, %ecx +; SSE-NEXT: sarl $4, %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: leal (%rdx,%rdx,2), %ecx +; SSE-NEXT: shll $3, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: leal 32767(%rax), %ecx +; SSE-NEXT: testw %ax, %ax +; SSE-NEXT: cmovnsl %eax, %ecx +; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 +; SSE-NEXT: addl %eax, %ecx +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movswl %ax, %ecx +; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 +; SSE-NEXT: movl %ecx, %edx +; SSE-NEXT: shrl $31, %edx +; SSE-NEXT: sarl $26, %ecx +; SSE-NEXT: addl %edx, %ecx +; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_i16_smax: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: movswl %cx, %edx +; AVX-NEXT: shrl $15, %ecx +; AVX-NEXT: sarl $4, %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: leal (%rdx,%rdx,2), %ecx +; AVX-NEXT: shll $3, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: addl %eax, %edx +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: leal 32767(%rax), %ecx +; AVX-NEXT: testw %ax, %ax +; AVX-NEXT: cmovnsl %eax, %ecx +; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000 +; AVX-NEXT: addl %eax, %ecx +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: movswl %ax, %ecx +; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 +; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: shrl $31, %edx +; AVX-NEXT: sarl $26, %ecx +; AVX-NEXT: addl %edx, %ecx +; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = srem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 srem. 
+define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { +; SSE-LABEL: dont_fold_srem_i64: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: addq %rcx, %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $4, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: leaq (%rdx,%rdx,2), %rax +; SSE-NEXT: shlq $3, %rax +; SSE-NEXT: subq %rax, %rdx +; SSE-NEXT: addq %rcx, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: pextrq $1, %xmm2, %rcx +; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $11, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: imulq %rdx +; SSE-NEXT: movq %rdx, %rax +; SSE-NEXT: shrq $63, %rax +; SSE-NEXT: sarq $8, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: retq +; +; AVX1-LABEL: dont_fold_srem_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $4, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: leaq (%rdx,%rdx,2), %rax +; AVX1-NEXT: shlq $3, %rax +; AVX1-NEXT: subq %rax, %rdx +; AVX1-NEXT: addq %rcx, %rdx +; AVX1-NEXT: vmovq %rdx, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $11, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: imulq %rdx +; AVX1-NEXT: movq %rdx, %rax +; AVX1-NEXT: shrq $63, %rax +; AVX1-NEXT: sarq $8, %rdx +; AVX1-NEXT: addq %rax, %rdx +; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_fold_srem_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $4, %rdx +; AVX2-NEXT: addq %rax, %rdx +; 
AVX2-NEXT: leaq (%rdx,%rdx,2), %rax +; AVX2-NEXT: shlq $3, %rax +; AVX2-NEXT: subq %rax, %rdx +; AVX2-NEXT: addq %rcx, %rdx +; AVX2-NEXT: vmovq %rdx, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $11, %rdx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: imulq %rdx +; AVX2-NEXT: movq %rdx, %rax +; AVX2-NEXT: shrq $63, %rax +; AVX2-NEXT: sarq $8, %rdx +; AVX2-NEXT: addq %rax, %rdx +; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %1 = srem <4 x i64> %x, + ret <4 x i64> %1 +} diff --git a/llvm/test/CodeGen/X86/switch-bt.ll b/llvm/test/CodeGen/X86/switch-bt.ll index 797ad4bccfd07c..965cdbf17f508d 100644 --- a/llvm/test/CodeGen/X86/switch-bt.ll +++ b/llvm/test/CodeGen/X86/switch-bt.ll @@ -157,13 +157,12 @@ sw.epilog: } -; TODO: Omit the range check when the default case is unreachable, see PR43129. +; Omit the range check when the default case is unreachable, see PR43129. declare void @g(i32) define void @test5(i32 %x) { ; CHECK-LABEL: test5 -; CHECK: cmpl $8, %edi -; CHECK: ja +; CHECK-NOT: cmp ; 73 = 2^0 + 2^3 + 2^6 ; CHECK: movl $73 diff --git a/llvm/test/CodeGen/X86/urem-lkk.ll b/llvm/test/CodeGen/X86/urem-lkk.ll new file mode 100644 index 00000000000000..84b0d0d68a0af9 --- /dev/null +++ b/llvm/test/CodeGen/X86/urem-lkk.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK + +define i32 @fold_urem_positive_odd(i32 %x) { +; CHECK-LABEL: fold_urem_positive_odd: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309 +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: subl %ecx, %edx +; CHECK-NEXT: shrl %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: shrl $6, %edx +; CHECK-NEXT: imull $95, %edx, %ecx +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 95 + ret i32 %1 +} + + +define i32 @fold_urem_positive_even(i32 %x) { +; CHECK-LABEL: fold_urem_positive_even: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: movl $4149100483, %edx # imm = 0xF74E3FC3 +; CHECK-NEXT: imulq %rcx, %rdx +; CHECK-NEXT: shrq $42, %rdx +; CHECK-NEXT: imull $1060, %edx, %ecx # imm = 0x424 +; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 1060 + ret i32 %1 +} + + +; Don't fold if we can combine urem with udiv. 
+define i32 @combine_urem_udiv(i32 %x) { +; CHECK-LABEL: combine_urem_udiv: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: imulq $1491936009, %rax, %rax # imm = 0x58ED2309 +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: subl %eax, %ecx +; CHECK-NEXT: shrl %ecx +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: shrl $6, %ecx +; CHECK-NEXT: imull $95, %ecx, %eax +; CHECK-NEXT: subl %eax, %edi +; CHECK-NEXT: leal (%rdi,%rcx), %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 95 + %2 = udiv i32 %x, 95 + %3 = add i32 %1, %2 + ret i32 %3 +} + +; Don't fold for divisors that are a power of two. +define i32 @dont_fold_urem_power_of_two(i32 %x) { +; CHECK-LABEL: dont_fold_urem_power_of_two: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: andl $63, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 64 + ret i32 %1 +} + +; Don't fold if the divisor is one. +define i32 @dont_fold_urem_one(i32 %x) { +; CHECK-LABEL: dont_fold_urem_one: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq + %1 = urem i32 %x, 1 + ret i32 %1 +} + +; Don't fold if the divisor is 2^32. +define i32 @dont_fold_urem_i32_umax(i32 %x) { +; CHECK-LABEL: dont_fold_urem_i32_umax: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %1 = urem i32 %x, 4294967296 + ret i32 %1 +} + +; Don't fold i64 urem +define i64 @dont_fold_urem_i64(i64 %x) { +; CHECK-LABEL: dont_fold_urem_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrq %rax +; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1 +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: shrq $4, %rdx +; CHECK-NEXT: imulq $98, %rdx, %rax +; CHECK-NEXT: subq %rax, %rdi +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: retq + %1 = urem i64 %x, 98 + ret i64 %1 +} diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll new file mode 100644 index 00000000000000..65eb3557f4ce04 --- /dev/null +++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -0,0 +1,378 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 + +define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { +; SSE-LABEL: fold_urem_vec_1: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shrl $2, %ecx +; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 +; SSE-NEXT: shrl $19, %ecx +; SSE-NEXT: imull $124, %ecx, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: movzwl %cx, %edx +; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 +; SSE-NEXT: shrl $22, %edx +; SSE-NEXT: imull $95, %edx, %edx +; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shrl %ecx +; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 +; SSE-NEXT: shrl $17, %ecx +; SSE-NEXT: imull $98, %ecx, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: movl %eax, %edx +; 
SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: movzwl %dx, %edx +; SSE-NEXT: shrl %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: shrl $9, %edx +; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fold_urem_vec_1: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $2, %ecx +; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 +; AVX-NEXT: shrl $19, %ecx +; AVX-NEXT: imull $124, %ecx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: movzwl %cx, %edx +; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 +; AVX-NEXT: shrl $22, %edx +; AVX-NEXT: imull $95, %edx, %edx +; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl %ecx +; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 +; AVX-NEXT: shrl $17, %ecx +; AVX-NEXT: imull $98, %ecx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: shrl %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: shrl $9, %edx +; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { +; SSE-LABEL: fold_urem_vec_2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: psrlw $6, %xmm1 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: fold_urem_vec_2: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + + +; Don't fold if we can combine urem with udiv. +define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { +; SSE-LABEL: combine_urem_udiv: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: psrlw $6, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmullw %xmm1, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_urem_udiv: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 +; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + %2 = udiv <4 x i16> %x, + %3 = add <4 x i16> %1, %2 + ret <4 x i16> %3 +} + +; Don't fold for divisors that are a power of two. 
+define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_power_of_two: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 +; SSE-NEXT: shrl $22, %ecx +; SSE-NEXT: imull $95, %ecx, %ecx +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pextrw $1, %xmm0, %ecx +; SSE-NEXT: andl $31, %ecx +; SSE-NEXT: movd %xmm0, %edx +; SSE-NEXT: andl $63, %edx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %ecx +; SSE-NEXT: andl $7, %ecx +; SSE-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_power_of_two: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 +; AVX-NEXT: shrl $22, %ecx +; AVX-NEXT: imull $95, %ecx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpextrw $1, %xmm0, %ecx +; AVX-NEXT: andl $31, %ecx +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: andl $63, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-NEXT: andl $7, %ecx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 +; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold if the divisor is one. +define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { +; SSE-LABEL: dont_fold_urem_one: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D +; SSE-NEXT: shrl $16, %ecx +; SSE-NEXT: movl %eax, %edx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: movzwl %dx, %edx +; SSE-NEXT: shrl %edx +; SSE-NEXT: addl %ecx, %edx +; SSE-NEXT: shrl $4, %edx +; SSE-NEXT: leal (%rdx,%rdx,2), %ecx +; SSE-NEXT: shll $3, %ecx +; SSE-NEXT: subl %ecx, %edx +; SSE-NEXT: addl %eax, %edx +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B +; SSE-NEXT: shrl $25, %ecx +; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 +; SSE-NEXT: shrl $26, %ecx +; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: pinsrw $3, %eax, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: dont_fold_urem_one: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: shrl %edx +; AVX-NEXT: addl %ecx, %edx +; AVX-NEXT: shrl $4, %edx +; AVX-NEXT: leal (%rdx,%rdx,2), %ecx +; AVX-NEXT: shll $3, %ecx +; AVX-NEXT: subl %ecx, %edx +; AVX-NEXT: addl %eax, %edx +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B +; AVX-NEXT: shrl $25, %ecx +; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 +; AVX-NEXT: shrl $26, %ecx +; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = urem <4 x i16> %x, + ret 
<4 x i16> %1 +} + +; Don't fold if the divisor is 2^16. +define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { +; CHECK-LABEL: dont_fold_urem_i16_smax: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %1 = urem <4 x i16> %x, + ret <4 x i16> %1 +} + +; Don't fold i64 urem. +define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { +; SSE-LABEL: dont_fold_urem_i64: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: mulq %rdx +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: subq %rdx, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: addq %rdx, %rax +; SSE-NEXT: shrq $4, %rax +; SSE-NEXT: leaq (%rax,%rax,2), %rdx +; SSE-NEXT: shlq $3, %rdx +; SSE-NEXT: subq %rdx, %rax +; SSE-NEXT: addq %rcx, %rax +; SSE-NEXT: movq %rax, %xmm2 +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: mulq %rdx +; SSE-NEXT: shrq $12, %rdx +; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: shrq %rax +; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; SSE-NEXT: mulq %rdx +; SSE-NEXT: shrq $7, %rdx +; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: movq %rcx, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: dont_fold_urem_i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rcx +; AVX1-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: addq %rdx, %rax +; AVX1-NEXT: shrq $4, %rax +; AVX1-NEXT: leaq (%rax,%rax,2), %rdx +; AVX1-NEXT: shlq $3, %rdx +; AVX1-NEXT: subq %rdx, %rax +; AVX1-NEXT: addq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm1, %rcx +; AVX1-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: shrq $12, %rdx +; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rcx +; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: shrq %rax +; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX1-NEXT: mulq %rdx +; AVX1-NEXT: shrq $7, %rdx +; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX1-NEXT: subq %rax, %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: dont_fold_urem_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9 +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: mulq %rdx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: shrq %rax +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: shrq $4, %rax +; AVX2-NEXT: leaq (%rax,%rax,2), %rdx +; AVX2-NEXT: shlq $3, 
%rdx +; AVX2-NEXT: subq %rdx, %rax +; AVX2-NEXT: addq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: mulq %rdx +; AVX2-NEXT: shrq $12, %rdx +; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: shrq %rax +; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5 +; AVX2-NEXT: mulq %rdx +; AVX2-NEXT: shrq $7, %rdx +; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E +; AVX2-NEXT: subq %rax, %rcx +; AVX2-NEXT: vmovq %rcx, %xmm0 +; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %1 = urem <4 x i64> %x, + ret <4 x i64> %1 +} \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll b/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll new file mode 100644 index 00000000000000..adb9eb2d49c29b --- /dev/null +++ b/llvm/test/CodeGen/X86/use-cr-result-of-dom-icmp-st.ll @@ -0,0 +1,627 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -O3 -verify-machineinstrs < %s | FileCheck %s + +; Test cases are generated from: +; long long NAME(PARAM a, PARAM b) { +; if (LHS > RHS) +; return b; +; if (LHS < RHS) +; return a;\ +; return a * b; +; } +; Please note funtion name is defined as __. Take ll_a_op_b__1 +; for example. ll is PARAM, a_op_b (i.e., a << b) is LHS, _1 (i.e., -1) is RHS. 
+ +target datalayout = "e-m:e-i64:64-n32:64" + +define i64 @ll_a_op_b__2(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b__2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shlq %cl, %rax +; CHECK-NEXT: cmpq $-2, %rax +; CHECK-NEXT: jle .LBB0_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, -2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, -2 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_op_b__1(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b__1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shlq %cl, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: js .LBB1_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB1_1: # %if.end +; CHECK-NEXT: cmpq $-1, %rax +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, -1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, -1 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_op_b_0(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shlq %cl, %rax +; CHECK-NEXT: testq %rax, %rax +; CHECK-NEXT: jle .LBB2_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB2_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, 0 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_op_b_1(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shlq %cl, %rax +; CHECK-NEXT: cmpq $1, %rax +; CHECK-NEXT: jle .LBB3_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB3_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, 1 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_op_b_2(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_op_b_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shlq %cl, %rax +; CHECK-NEXT: cmpq $2, %rax +; CHECK-NEXT: jle .LBB4_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: 
movq %rcx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB4_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i64 %a, %b + %cmp = icmp sgt i64 %shl, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i64 %shl, 2 + %mul = select i1 %cmp2, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a__2(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a__2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpq $-2, %rdi +; CHECK-NEXT: jle .LBB5_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB5_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rsi, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i64 %a, -2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, -2 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a__1(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a__1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: js .LBB6_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB6_1: # %if.end +; CHECK-NEXT: cmpq $-1, %rdi +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rsi, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i64 %a, -1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, -1 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_0(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: jle .LBB7_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB7_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rsi, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i64 %a, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, 0 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_1(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpq $1, %rdi +; CHECK-NEXT: jle .LBB8_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB8_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rsi, %rax +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i64 %a, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, 1 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @ll_a_2(i64 %a, i64 %b) { +; CHECK-LABEL: ll_a_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpq $2, %rdi +; CHECK-NEXT: jle .LBB9_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB9_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmoveq %rsi, %rax +; CHECK-NEXT: imulq %rdi, %rax +; 
CHECK-NEXT: retq +entry: + %cmp = icmp sgt i64 %a, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i64 %a, 2 + %mul = select i1 %cmp1, i64 %b, i64 1 + %spec.select = mul nsw i64 %mul, %a + ret i64 %spec.select + +return: ; preds = %entry + ret i64 %b +} + +define i64 @i_a_op_b__2(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b__2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll %cl, %eax +; CHECK-NEXT: cmpl $-2, %eax +; CHECK-NEXT: jg .LBB10_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: .LBB10_2: # %return +; CHECK-NEXT: movslq %ecx, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, -2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, -2 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_op_b__1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b__1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll %cl, %eax +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: js .LBB11_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movslq %ecx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB11_1: # %if.end +; CHECK-NEXT: cmpl $-1, %eax +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movslq %ecx, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, -1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, -1 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_op_b_0(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll %cl, %eax +; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: jle .LBB12_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movslq %ecx, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB12_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movslq %ecx, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, 0 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_op_b_1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll %cl, %eax +; CHECK-NEXT: cmpl $1, %eax +; CHECK-NEXT: jg .LBB13_2 +; CHECK-NEXT: # 
%bb.1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: .LBB13_2: # %return +; CHECK-NEXT: movslq %ecx, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, 1 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_op_b_2(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_op_b_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shll %cl, %eax +; CHECK-NEXT: cmpl $2, %eax +; CHECK-NEXT: jg .LBB14_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %ecx, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: .LBB14_2: # %return +; CHECK-NEXT: movslq %ecx, %rax +; CHECK-NEXT: retq +entry: + %shl = shl i32 %a, %b + %cmp = icmp sgt i32 %shl, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp eq i32 %shl, 2 + %mul = select i1 %cmp2, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a__2(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a__2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpl $-2, %edi +; CHECK-NEXT: jg .LBB15_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: .LBB15_2: # %return +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i32 %a, -2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, -2 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a__1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a__1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: js .LBB16_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB16_1: # %if.end +; CHECK-NEXT: cmpl $-1, %edi +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i32 %a, -1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, -1 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_0(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: jle .LBB17_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: movslq %esi, %rax +; 
CHECK-NEXT: retq +; CHECK-NEXT: .LBB17_1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, 0 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_1(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: jg .LBB18_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: .LBB18_2: # %return +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i32 %a, 1 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, 1 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} + +define i64 @i_a_2(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: i_a_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpl $2, %edi +; CHECK-NEXT: jg .LBB19_2 +; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: imull %edi, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: .LBB19_2: # %return +; CHECK-NEXT: movslq %esi, %rax +; CHECK-NEXT: retq +entry: + %cmp = icmp sgt i32 %a, 2 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %cmp1 = icmp eq i32 %a, 2 + %mul = select i1 %cmp1, i32 %b, i32 1 + %spec.select = mul nsw i32 %mul, %a + br label %return + +return: ; preds = %if.end, %entry + %retval.0.in = phi i32 [ %b, %entry ], [ %spec.select, %if.end ] + %retval.0 = sext i32 %retval.0.in to i64 + ret i64 %retval.0 +} diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index b809e55ce5e2bb..436e48729f9f6f 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1730,20 +1730,26 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: smulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm1, %rdx +; AVX512-NEXT: vmovq %xmm0, %rsi ; AVX512-NEXT: imulq %rdx, %rsi -; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: seto %dl ; AVX512-NEXT: imulq %rax, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %edx, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, 
%k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -2197,46 +2203,76 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: smulo_v4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 ; AVX512-NEXT: kmovd %k1, %r9d ; AVX512-NEXT: andb $1, %r9b ; AVX512-NEXT: negb %r9b -; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: negb %r10b ; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k1, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl -; AVX512-NEXT: kshiftrw $2, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: andb $1, %r11b +; AVX512-NEXT: negb %r11b +; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: kshiftrw $1, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: negb %sil -; AVX512-NEXT: kmovd %k1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: negb %al +; AVX512-NEXT: kshiftrw $1, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %edx ; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: negb %dl +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: andb $1, %al +; AVX512-NEXT: negb %al +; AVX512-NEXT: kmovd %k0, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: imulb %dl +; AVX512-NEXT: imulb %cl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %r8d, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl -; AVX512-NEXT: cmpb %r8b, %dl -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %al, %dl +; AVX512-NEXT: movl %r8d, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: cmpb %r8b, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: imulb %cl +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k0 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: imulb %sil +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: seto %al +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: cmpb %dl, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kshiftlw $1, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k1, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k0, %k2 +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: imulb %bl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: movl %esi, %ecx @@ -2246,26 +2282,22 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: 
kmovd %eax, %k2 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k2 -; AVX512-NEXT: kshiftrw $2, %k2, %k3 -; AVX512-NEXT: kxorw %k1, %k3, %k1 -; AVX512-NEXT: kshiftlw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kmovd %eax, %k3 +; AVX512-NEXT: kshiftlw $2, %k3, %k3 +; AVX512-NEXT: korw %k3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: kshiftlw $13, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: imulb %r10b +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: imulb %r9b ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl -; AVX512-NEXT: cmpb %al, %dl -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %cl, %dl +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: cmpb %al, %bl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %cl, %bl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k2 ; AVX512-NEXT: kshiftlw $3, %k2, %k2 @@ -2273,21 +2305,34 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $14, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kshiftlw $4, %k1, %k1 +; AVX512-NEXT: kshiftlw $13, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $12, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 07899d0dddf6a1..c859ce7b74bb8b 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1532,21 +1532,28 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: umulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovq %xmm1, %rdx ; AVX512-NEXT: mulq %rdx -; AVX512-NEXT: vmovq %rax, 
%xmm0 +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: seto %r9b ; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: mulq %rsi -; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: mulq %r8 +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %r9d, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -1945,6 +1952,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: umulo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 @@ -1956,40 +1964,60 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: andb $1, %r11b +; AVX512-NEXT: kshiftrw $2, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: kshiftrw $1, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: kshiftrw $1, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil -; AVX512-NEXT: kshiftrw $2, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: andb $1, %al ; AVX512-NEXT: kmovd %k1, %ecx ; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: kmovd %k0, %edx -; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: mulb %dl +; AVX512-NEXT: mulb %cl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %r8b -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %al, %dl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: mulb %cl +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k0 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: mulb %sil +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: seto %al +; AVX512-NEXT: testb $-2, %dl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kshiftlw $1, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k1, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k0, %k2 +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: mulb %bl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %sil ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k2 -; AVX512-NEXT: kshiftrw $2, %k2, %k3 -; AVX512-NEXT: kxorw %k1, 
%k3, %k1 -; AVX512-NEXT: kshiftlw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kmovd %eax, %k3 +; AVX512-NEXT: kshiftlw $2, %k3, %k3 +; AVX512-NEXT: korw %k3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: kshiftlw $13, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 ; AVX512-NEXT: movl %r9d, %eax @@ -1997,8 +2025,8 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl ; AVX512-NEXT: testb $-2, %al -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %cl, %dl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %cl, %bl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k2 ; AVX512-NEXT: kshiftlw $3, %k2, %k2 @@ -2006,21 +2034,34 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $14, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kshiftlw $4, %k1, %k1 +; AVX512-NEXT: kshiftlw $13, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $12, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll index e35e76d2f38a06..5f39f33dd6a5ca 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s -define <1 x float> @constrained_vector_fma_v1f32() { +define <1 x float> @constrained_vector_fma_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fma_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -14,11 +14,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %fma } -define <2 x double> @constrained_vector_fma_v2f64() { +define <2 x double> @constrained_vector_fma_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fma_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vmovapd {{.*#+}} xmm1 = [1.5E+0,5.0E-1] @@ -31,11 +31,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %fma } -define <3 x float> @constrained_vector_fma_v3f32() { +define <3 x float> @constrained_vector_fma_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fma_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -56,11 +56,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %fma } -define <3 x double> @constrained_vector_fma_v3f64() { +define <3 x double> @constrained_vector_fma_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fma_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -77,11 +77,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %fma } -define <4 x double> @constrained_vector_fma_v4f64() { +define <4 x double> @constrained_vector_fma_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fma_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1] @@ -94,11 +94,11 @@ entry: <4 x double> , <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %fma } -define <4 x float> @constrained_vector_fma_v4f32() { +define <4 x float> @constrained_vector_fma_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fma_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1] @@ -111,11 +111,11 @@ entry: <4 x float> , <4 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %fma } -define <8 x float> @constrained_vector_fma_v8f32() { +define <8 x float> @constrained_vector_fma_v8f32() #0 { ; CHECK-LABEL: constrained_vector_fma_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1,7.5E+0,6.5E+0,5.5E+0,4.5E+0] @@ -131,10 +131,12 @@ entry: <8 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <8 x float> %fma } +attributes #0 = { strictfp } + ; Single width declarations declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata) declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 19b2b4864ea7ae..a742d4aa2e7dd1 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -2,7 +2,7 @@ ; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx < %s | FileCheck --check-prefix=AVX %s -define <1 x float> @constrained_vector_fdiv_v1f32() { +define <1 x float> @constrained_vector_fdiv_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fdiv_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -19,11 +19,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %div } -define <2 x double> 
@constrained_vector_fdiv_v2f64() { +define <2 x double> @constrained_vector_fdiv_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fdiv_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] @@ -40,11 +40,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %div } -define <3 x float> @constrained_vector_fdiv_v3f32() { +define <3 x float> @constrained_vector_fdiv_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fdiv_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -75,11 +75,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %div } -define <3 x double> @constrained_vector_fdiv_v3f64() { +define <3 x double> @constrained_vector_fdiv_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fdiv_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0] @@ -105,11 +105,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %div } -define <4 x double> @constrained_vector_fdiv_v4f64() { +define <4 x double> @constrained_vector_fdiv_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fdiv_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm2 = [1.0E+1,1.0E+1] @@ -131,11 +131,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %div } -define <1 x float> @constrained_vector_frem_v1f32() { +define <1 x float> @constrained_vector_frem_v1f32() #0 { ; CHECK-LABEL: constrained_vector_frem_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -162,11 +162,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %rem } -define <2 x double> @constrained_vector_frem_v2f64() { +define <2 x double> @constrained_vector_frem_v2f64() #0 { ; CHECK-LABEL: constrained_vector_frem_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -205,11 +205,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %rem } -define <3 x float> @constrained_vector_frem_v3f32() { +define <3 x float> @constrained_vector_frem_v3f32() #0 { ; CHECK-LABEL: constrained_vector_frem_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -261,11 +261,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %rem } -define <3 x double> @constrained_vector_frem_v3f64() { +define <3 x double> @constrained_vector_frem_v3f64() #0 { ; CHECK-LABEL: constrained_vector_frem_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -319,11 +319,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %rem } -define <4 x double> @constrained_vector_frem_v4f64() { +define <4 x double> @constrained_vector_frem_v4f64() #0 { ; CHECK-LABEL: constrained_vector_frem_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp @@ -386,11 +386,11 @@ define <4 x double> @constrained_vector_frem_v4f64() { <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata 
!"fpexcept.strict") #0 ret <4 x double> %rem } -define <1 x float> @constrained_vector_fmul_v1f32() { +define <1 x float> @constrained_vector_fmul_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fmul_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -407,11 +407,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %mul } -define <2 x double> @constrained_vector_fmul_v2f64() { +define <2 x double> @constrained_vector_fmul_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fmul_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] @@ -428,11 +428,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %mul } -define <3 x float> @constrained_vector_fmul_v3f32() { +define <3 x float> @constrained_vector_fmul_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fmul_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -460,11 +460,11 @@ entry: float 0x7FF0000000000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %mul } -define <3 x double> @constrained_vector_fmul_v3f64() { +define <3 x double> @constrained_vector_fmul_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fmul_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] @@ -491,11 +491,11 @@ entry: double 0x7FEFFFFFFFFFFFFF>, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %mul } -define <4 x double> @constrained_vector_fmul_v4f64() { +define <4 x double> @constrained_vector_fmul_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fmul_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] @@ -516,11 +516,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %mul } -define <1 x float> @constrained_vector_fadd_v1f32() { +define <1 x float> @constrained_vector_fadd_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fadd_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -537,11 +537,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %add } -define <2 x double> @constrained_vector_fadd_v2f64() { +define <2 x double> @constrained_vector_fadd_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fadd_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -563,11 +563,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %add } -define <3 x float> @constrained_vector_fadd_v3f32() { +define <3 x float> @constrained_vector_fadd_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fadd_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorps %xmm1, %xmm1 @@ -596,11 +596,11 @@ entry: float 0xFFFFFFFFE0000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %add } -define <3 x double> @constrained_vector_fadd_v3f64() { +define <3 x double> 
@constrained_vector_fadd_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fadd_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorpd %xmm2, %xmm2 @@ -629,11 +629,11 @@ entry: double 0x7FEFFFFFFFFFFFFF>, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %add } -define <4 x double> @constrained_vector_fadd_v4f64() { +define <4 x double> @constrained_vector_fadd_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fadd_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero @@ -666,11 +666,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %add } -define <1 x float> @constrained_vector_fsub_v1f32() { +define <1 x float> @constrained_vector_fsub_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fsub_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -687,11 +687,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %sub } -define <2 x double> @constrained_vector_fsub_v2f64() { +define <2 x double> @constrained_vector_fsub_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fsub_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -713,11 +713,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %sub } -define <3 x float> @constrained_vector_fsub_v3f32() { +define <3 x float> @constrained_vector_fsub_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fsub_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorps %xmm0, %xmm0 @@ -747,11 +747,11 @@ entry: float 0xFFFFFFFFE0000000>, <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %sub } -define <3 x double> @constrained_vector_fsub_v3f64() { +define <3 x double> @constrained_vector_fsub_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fsub_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorpd %xmm0, %xmm0 @@ -781,11 +781,11 @@ entry: double 0xFFEFFFFFFFFFFFFF>, <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %sub } -define <4 x double> @constrained_vector_fsub_v4f64() { +define <4 x double> @constrained_vector_fsub_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fsub_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero @@ -818,11 +818,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %sub } -define <1 x float> @constrained_vector_sqrt_v1f32() { +define <1 x float> @constrained_vector_sqrt_v1f32() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -838,11 +838,11 @@ entry: %sqrt = call <1 x float> @llvm.experimental.constrained.sqrt.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %sqrt } -define <2 x double> @constrained_vector_sqrt_v2f64() { +define <2 x double> @constrained_vector_sqrt_v2f64() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0 @@ -856,11 +856,11 @@ entry: %sqrt = call <2 x double> 
@llvm.experimental.constrained.sqrt.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %sqrt } -define <3 x float> @constrained_vector_sqrt_v3f32() { +define <3 x float> @constrained_vector_sqrt_v3f32() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -888,11 +888,11 @@ entry: %sqrt = call <3 x float> @llvm.experimental.constrained.sqrt.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %sqrt } -define <3 x double> @constrained_vector_sqrt_v3f64() { +define <3 x double> @constrained_vector_sqrt_v3f64() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -915,11 +915,11 @@ entry: %sqrt = call <3 x double> @llvm.experimental.constrained.sqrt.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %sqrt } -define <4 x double> @constrained_vector_sqrt_v4f64() { +define <4 x double> @constrained_vector_sqrt_v4f64() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sqrtpd {{.*}}(%rip), %xmm0 @@ -935,11 +935,11 @@ define <4 x double> @constrained_vector_sqrt_v4f64() { <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %sqrt } -define <1 x float> @constrained_vector_pow_v1f32() { +define <1 x float> @constrained_vector_pow_v1f32() #0 { ; CHECK-LABEL: constrained_vector_pow_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -966,11 +966,11 @@ entry: <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %pow } -define <2 x double> @constrained_vector_pow_v2f64() { +define <2 x double> @constrained_vector_pow_v2f64() #0 { ; CHECK-LABEL: constrained_vector_pow_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1009,11 +1009,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %pow } -define <3 x float> @constrained_vector_pow_v3f32() { +define <3 x float> @constrained_vector_pow_v3f32() #0 { ; CHECK-LABEL: constrained_vector_pow_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -1065,11 +1065,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %pow } -define <3 x double> @constrained_vector_pow_v3f64() { +define <3 x double> @constrained_vector_pow_v3f64() #0 { ; CHECK-LABEL: constrained_vector_pow_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1123,11 +1123,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %pow } -define <4 x double> @constrained_vector_pow_v4f64() { +define <4 x double> @constrained_vector_pow_v4f64() #0 { ; CHECK-LABEL: constrained_vector_pow_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -1191,11 +1191,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %pow } -define <1 x float> @constrained_vector_powi_v1f32() { +define <1 x float> 
@constrained_vector_powi_v1f32() #0 { ; CHECK-LABEL: constrained_vector_powi_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -1222,11 +1222,11 @@ entry: <1 x float> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %powi } -define <2 x double> @constrained_vector_powi_v2f64() { +define <2 x double> @constrained_vector_powi_v2f64() #0 { ; CHECK-LABEL: constrained_vector_powi_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1265,11 +1265,11 @@ entry: <2 x double> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %powi } -define <3 x float> @constrained_vector_powi_v3f32() { +define <3 x float> @constrained_vector_powi_v3f32() #0 { ; CHECK-LABEL: constrained_vector_powi_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -1321,11 +1321,11 @@ entry: <3 x float> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %powi } -define <3 x double> @constrained_vector_powi_v3f64() { +define <3 x double> @constrained_vector_powi_v3f64() #0 { ; CHECK-LABEL: constrained_vector_powi_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1379,11 +1379,11 @@ entry: <3 x double> , i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %powi } -define <4 x double> @constrained_vector_powi_v4f64() { +define <4 x double> @constrained_vector_powi_v4f64() #0 { ; CHECK-LABEL: constrained_vector_powi_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -1446,11 +1446,11 @@ entry: double 42.3, double 42.4>, i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %powi } -define <1 x float> @constrained_vector_sin_v1f32() { +define <1 x float> @constrained_vector_sin_v1f32() #0 { ; CHECK-LABEL: constrained_vector_sin_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -1474,11 +1474,11 @@ entry: %sin = call <1 x float> @llvm.experimental.constrained.sin.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %sin } -define <2 x double> @constrained_vector_sin_v2f64() { +define <2 x double> @constrained_vector_sin_v2f64() #0 { ; CHECK-LABEL: constrained_vector_sin_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1512,11 +1512,11 @@ entry: %sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %sin } -define <3 x float> @constrained_vector_sin_v3f32() { +define <3 x float> @constrained_vector_sin_v3f32() #0 { ; CHECK-LABEL: constrained_vector_sin_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -1561,11 +1561,11 @@ entry: %sin = call <3 x float> @llvm.experimental.constrained.sin.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %sin } -define <3 x double> @constrained_vector_sin_v3f64() { +define <3 x double> @constrained_vector_sin_v3f64() #0 { ; CHECK-LABEL: constrained_vector_sin_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1612,11 +1612,11 @@ entry: %sin = call <3 x double> @llvm.experimental.constrained.sin.v3f64( <3 x double> , metadata !"round.dynamic", - metadata 
!"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %sin } -define <4 x double> @constrained_vector_sin_v4f64() { +define <4 x double> @constrained_vector_sin_v4f64() #0 { ; CHECK-LABEL: constrained_vector_sin_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -1670,11 +1670,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %sin } -define <1 x float> @constrained_vector_cos_v1f32() { +define <1 x float> @constrained_vector_cos_v1f32() #0 { ; CHECK-LABEL: constrained_vector_cos_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -1698,11 +1698,11 @@ entry: %cos = call <1 x float> @llvm.experimental.constrained.cos.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %cos } -define <2 x double> @constrained_vector_cos_v2f64() { +define <2 x double> @constrained_vector_cos_v2f64() #0 { ; CHECK-LABEL: constrained_vector_cos_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1736,11 +1736,11 @@ entry: %cos = call <2 x double> @llvm.experimental.constrained.cos.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %cos } -define <3 x float> @constrained_vector_cos_v3f32() { +define <3 x float> @constrained_vector_cos_v3f32() #0 { ; CHECK-LABEL: constrained_vector_cos_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -1785,11 +1785,11 @@ entry: %cos = call <3 x float> @llvm.experimental.constrained.cos.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %cos } -define <3 x double> @constrained_vector_cos_v3f64() { +define <3 x double> @constrained_vector_cos_v3f64() #0 { ; CHECK-LABEL: constrained_vector_cos_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1836,11 +1836,11 @@ entry: %cos = call <3 x double> @llvm.experimental.constrained.cos.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %cos } -define <4 x double> @constrained_vector_cos_v4f64() { +define <4 x double> @constrained_vector_cos_v4f64() #0 { ; CHECK-LABEL: constrained_vector_cos_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -1894,11 +1894,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %cos } -define <1 x float> @constrained_vector_exp_v1f32() { +define <1 x float> @constrained_vector_exp_v1f32() #0 { ; CHECK-LABEL: constrained_vector_exp_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -1922,11 +1922,11 @@ entry: %exp = call <1 x float> @llvm.experimental.constrained.exp.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %exp } -define <2 x double> @constrained_vector_exp_v2f64() { +define <2 x double> @constrained_vector_exp_v2f64() #0 { ; CHECK-LABEL: constrained_vector_exp_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -1960,11 +1960,11 @@ entry: %exp = call <2 x double> @llvm.experimental.constrained.exp.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %exp } -define <3 x float> @constrained_vector_exp_v3f32() { +define <3 x float> 
@constrained_vector_exp_v3f32() #0 { ; CHECK-LABEL: constrained_vector_exp_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2009,11 +2009,11 @@ entry: %exp = call <3 x float> @llvm.experimental.constrained.exp.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %exp } -define <3 x double> @constrained_vector_exp_v3f64() { +define <3 x double> @constrained_vector_exp_v3f64() #0 { ; CHECK-LABEL: constrained_vector_exp_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2060,11 +2060,11 @@ entry: %exp = call <3 x double> @llvm.experimental.constrained.exp.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %exp } -define <4 x double> @constrained_vector_exp_v4f64() { +define <4 x double> @constrained_vector_exp_v4f64() #0 { ; CHECK-LABEL: constrained_vector_exp_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2118,11 +2118,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %exp } -define <1 x float> @constrained_vector_exp2_v1f32() { +define <1 x float> @constrained_vector_exp2_v1f32() #0 { ; CHECK-LABEL: constrained_vector_exp2_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -2146,11 +2146,11 @@ entry: %exp2 = call <1 x float> @llvm.experimental.constrained.exp2.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %exp2 } -define <2 x double> @constrained_vector_exp2_v2f64() { +define <2 x double> @constrained_vector_exp2_v2f64() #0 { ; CHECK-LABEL: constrained_vector_exp2_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2184,11 +2184,11 @@ entry: %exp2 = call <2 x double> @llvm.experimental.constrained.exp2.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %exp2 } -define <3 x float> @constrained_vector_exp2_v3f32() { +define <3 x float> @constrained_vector_exp2_v3f32() #0 { ; CHECK-LABEL: constrained_vector_exp2_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2233,11 +2233,11 @@ entry: %exp2 = call <3 x float> @llvm.experimental.constrained.exp2.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %exp2 } -define <3 x double> @constrained_vector_exp2_v3f64() { +define <3 x double> @constrained_vector_exp2_v3f64() #0 { ; CHECK-LABEL: constrained_vector_exp2_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2284,11 +2284,11 @@ entry: %exp2 = call <3 x double> @llvm.experimental.constrained.exp2.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %exp2 } -define <4 x double> @constrained_vector_exp2_v4f64() { +define <4 x double> @constrained_vector_exp2_v4f64() #0 { ; CHECK-LABEL: constrained_vector_exp2_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2342,11 +2342,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %exp2 } -define <1 x float> @constrained_vector_log_v1f32() { +define <1 x float> @constrained_vector_log_v1f32() #0 { ; CHECK-LABEL: constrained_vector_log_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax 
@@ -2370,11 +2370,11 @@ entry: %log = call <1 x float> @llvm.experimental.constrained.log.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %log } -define <2 x double> @constrained_vector_log_v2f64() { +define <2 x double> @constrained_vector_log_v2f64() #0 { ; CHECK-LABEL: constrained_vector_log_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2408,11 +2408,11 @@ entry: %log = call <2 x double> @llvm.experimental.constrained.log.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %log } -define <3 x float> @constrained_vector_log_v3f32() { +define <3 x float> @constrained_vector_log_v3f32() #0 { ; CHECK-LABEL: constrained_vector_log_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2457,11 +2457,11 @@ entry: %log = call <3 x float> @llvm.experimental.constrained.log.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %log } -define <3 x double> @constrained_vector_log_v3f64() { +define <3 x double> @constrained_vector_log_v3f64() #0 { ; CHECK-LABEL: constrained_vector_log_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2508,11 +2508,11 @@ entry: %log = call <3 x double> @llvm.experimental.constrained.log.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %log } -define <4 x double> @constrained_vector_log_v4f64() { +define <4 x double> @constrained_vector_log_v4f64() #0 { ; CHECK-LABEL: constrained_vector_log_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2566,11 +2566,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %log } -define <1 x float> @constrained_vector_log10_v1f32() { +define <1 x float> @constrained_vector_log10_v1f32() #0 { ; CHECK-LABEL: constrained_vector_log10_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -2594,11 +2594,11 @@ entry: %log10 = call <1 x float> @llvm.experimental.constrained.log10.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %log10 } -define <2 x double> @constrained_vector_log10_v2f64() { +define <2 x double> @constrained_vector_log10_v2f64() #0 { ; CHECK-LABEL: constrained_vector_log10_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2632,11 +2632,11 @@ entry: %log10 = call <2 x double> @llvm.experimental.constrained.log10.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %log10 } -define <3 x float> @constrained_vector_log10_v3f32() { +define <3 x float> @constrained_vector_log10_v3f32() #0 { ; CHECK-LABEL: constrained_vector_log10_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2681,11 +2681,11 @@ entry: %log10 = call <3 x float> @llvm.experimental.constrained.log10.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %log10 } -define <3 x double> @constrained_vector_log10_v3f64() { +define <3 x double> @constrained_vector_log10_v3f64() #0 { ; CHECK-LABEL: constrained_vector_log10_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2732,11 +2732,11 @@ entry: %log10 = call <3 x double> 
@llvm.experimental.constrained.log10.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %log10 } -define <4 x double> @constrained_vector_log10_v4f64() { +define <4 x double> @constrained_vector_log10_v4f64() #0 { ; CHECK-LABEL: constrained_vector_log10_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2790,11 +2790,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %log10 } -define <1 x float> @constrained_vector_log2_v1f32() { +define <1 x float> @constrained_vector_log2_v1f32() #0 { ; CHECK-LABEL: constrained_vector_log2_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -2818,11 +2818,11 @@ entry: %log2 = call <1 x float> @llvm.experimental.constrained.log2.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %log2 } -define <2 x double> @constrained_vector_log2_v2f64() { +define <2 x double> @constrained_vector_log2_v2f64() #0 { ; CHECK-LABEL: constrained_vector_log2_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2856,11 +2856,11 @@ entry: %log2 = call <2 x double> @llvm.experimental.constrained.log2.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %log2 } -define <3 x float> @constrained_vector_log2_v3f32() { +define <3 x float> @constrained_vector_log2_v3f32() #0 { ; CHECK-LABEL: constrained_vector_log2_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -2905,11 +2905,11 @@ entry: %log2 = call <3 x float> @llvm.experimental.constrained.log2.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %log2 } -define <3 x double> @constrained_vector_log2_v3f64() { +define <3 x double> @constrained_vector_log2_v3f64() #0 { ; CHECK-LABEL: constrained_vector_log2_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -2956,11 +2956,11 @@ entry: %log2 = call <3 x double> @llvm.experimental.constrained.log2.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %log2 } -define <4 x double> @constrained_vector_log2_v4f64() { +define <4 x double> @constrained_vector_log2_v4f64() #0 { ; CHECK-LABEL: constrained_vector_log2_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3014,11 +3014,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %log2 } -define <1 x float> @constrained_vector_rint_v1f32() { +define <1 x float> @constrained_vector_rint_v1f32() #0 { ; CHECK-LABEL: constrained_vector_rint_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -3038,11 +3038,11 @@ entry: %rint = call <1 x float> @llvm.experimental.constrained.rint.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %rint } -define <2 x double> @constrained_vector_rint_v2f64() { +define <2 x double> @constrained_vector_rint_v2f64() #0 { ; CHECK-LABEL: constrained_vector_rint_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -3066,11 +3066,11 @@ entry: %rint = call <2 x double> @llvm.experimental.constrained.rint.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + 
metadata !"fpexcept.strict") #0 ret <2 x double> %rint } -define <3 x float> @constrained_vector_rint_v3f32() { +define <3 x float> @constrained_vector_rint_v3f32() #0 { ; CHECK-LABEL: constrained_vector_rint_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3107,11 +3107,11 @@ define <3 x float> @constrained_vector_rint_v3f32() { %rint = call <3 x float> @llvm.experimental.constrained.rint.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %rint } -define <3 x double> @constrained_vector_rint_v3f64() { +define <3 x double> @constrained_vector_rint_v3f64() #0 { ; CHECK-LABEL: constrained_vector_rint_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -3145,11 +3145,11 @@ entry: %rint = call <3 x double> @llvm.experimental.constrained.rint.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %rint } -define <4 x double> @constrained_vector_rint_v4f64() { +define <4 x double> @constrained_vector_rint_v4f64() #0 { ; CHECK-LABEL: constrained_vector_rint_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3184,11 +3184,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %rint } -define <1 x float> @constrained_vector_nearbyint_v1f32() { +define <1 x float> @constrained_vector_nearbyint_v1f32() #0 { ; CHECK-LABEL: constrained_vector_nearbyint_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -3208,11 +3208,11 @@ entry: %nearby = call <1 x float> @llvm.experimental.constrained.nearbyint.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %nearby } -define <2 x double> @constrained_vector_nearbyint_v2f64() { +define <2 x double> @constrained_vector_nearbyint_v2f64() #0 { ; CHECK-LABEL: constrained_vector_nearbyint_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -3236,11 +3236,11 @@ entry: %nearby = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %nearby } -define <3 x float> @constrained_vector_nearbyint_v3f32() { +define <3 x float> @constrained_vector_nearbyint_v3f32() #0 { ; CHECK-LABEL: constrained_vector_nearbyint_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3277,11 +3277,11 @@ entry: %nearby = call <3 x float> @llvm.experimental.constrained.nearbyint.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %nearby } -define <3 x double> @constrained_vector_nearby_v3f64() { +define <3 x double> @constrained_vector_nearby_v3f64() #0 { ; CHECK-LABEL: constrained_vector_nearby_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -3315,11 +3315,11 @@ entry: %nearby = call <3 x double> @llvm.experimental.constrained.nearbyint.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %nearby } -define <4 x double> @constrained_vector_nearbyint_v4f64() { +define <4 x double> @constrained_vector_nearbyint_v4f64() #0 { ; CHECK-LABEL: constrained_vector_nearbyint_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3354,11 +3354,11 @@ entry: <4 x double> , metadata !"round.dynamic", - 
metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %nearby } -define <1 x float> @constrained_vector_maxnum_v1f32() { +define <1 x float> @constrained_vector_maxnum_v1f32() #0 { ; CHECK-LABEL: constrained_vector_maxnum_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -3384,11 +3384,11 @@ entry: %max = call <1 x float> @llvm.experimental.constrained.maxnum.v1f32( <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %max } -define <2 x double> @constrained_vector_maxnum_v2f64() { +define <2 x double> @constrained_vector_maxnum_v2f64() #0 { ; CHECK-LABEL: constrained_vector_maxnum_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -3427,11 +3427,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %max } -define <3 x float> @constrained_vector_maxnum_v3f32() { +define <3 x float> @constrained_vector_maxnum_v3f32() #0 { ; CHECK-LABEL: constrained_vector_maxnum_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3483,11 +3483,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %max } -define <3 x double> @constrained_vector_max_v3f64() { +define <3 x double> @constrained_vector_max_v3f64() #0 { ; CHECK-LABEL: constrained_vector_max_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -3541,11 +3541,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %max } -define <4 x double> @constrained_vector_maxnum_v4f64() { +define <4 x double> @constrained_vector_maxnum_v4f64() #0 { ; CHECK-LABEL: constrained_vector_maxnum_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3609,11 +3609,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %max } -define <1 x float> @constrained_vector_minnum_v1f32() { +define <1 x float> @constrained_vector_minnum_v1f32() #0 { ; CHECK-LABEL: constrained_vector_minnum_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -3639,11 +3639,11 @@ define <1 x float> @constrained_vector_minnum_v1f32() { %min = call <1 x float> @llvm.experimental.constrained.minnum.v1f32( <1 x float> , <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %min } -define <2 x double> @constrained_vector_minnum_v2f64() { +define <2 x double> @constrained_vector_minnum_v2f64() #0 { ; CHECK-LABEL: constrained_vector_minnum_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -3682,11 +3682,11 @@ entry: <2 x double> , <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %min } -define <3 x float> @constrained_vector_minnum_v3f32() { +define <3 x float> @constrained_vector_minnum_v3f32() #0 { ; CHECK-LABEL: constrained_vector_minnum_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3738,11 +3738,11 @@ entry: <3 x float> , <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %min } -define <3 x double> @constrained_vector_min_v3f64() { +define <3 x double> @constrained_vector_min_v3f64() #0 { ; CHECK-LABEL: 
constrained_vector_min_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -3796,11 +3796,11 @@ entry: <3 x double> , <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %min } -define <4 x double> @constrained_vector_minnum_v4f64() { +define <4 x double> @constrained_vector_minnum_v4f64() #0 { ; CHECK-LABEL: constrained_vector_minnum_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -3864,11 +3864,11 @@ entry: <4 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %min } -define <1 x i32> @constrained_vector_fptosi_v1i32_v1f32() { +define <1 x i32> @constrained_vector_fptosi_v1i32_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v1i32_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax @@ -3881,11 +3881,11 @@ define <1 x i32> @constrained_vector_fptosi_v1i32_v1f32() { entry: %result = call <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f32( <1 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x i32> %result } -define <2 x i32> @constrained_vector_fptosi_v2i32_v2f32() { +define <2 x i32> @constrained_vector_fptosi_v2i32_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i32_v2f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax @@ -3905,11 +3905,11 @@ define <2 x i32> @constrained_vector_fptosi_v2i32_v2f32() { entry: %result = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f32( <2 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i32> %result } -define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() { +define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax @@ -3935,11 +3935,11 @@ entry: %result = call <3 x i32> @llvm.experimental.constrained.fptosi.v3i32.v3f32( <3 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x i32> %result } -define <4 x i32> @constrained_vector_fptosi_v4i32_v4f32() { +define <4 x i32> @constrained_vector_fptosi_v4i32_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i32_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax @@ -3970,11 +3970,11 @@ entry: %result = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32( <4 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i32> %result } -define <1 x i64> @constrained_vector_fptosi_v1i64_v1f32() { +define <1 x i64> @constrained_vector_fptosi_v1i64_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v1i64_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax @@ -3987,11 +3987,11 @@ define <1 x i64> @constrained_vector_fptosi_v1i64_v1f32() { entry: %result = call <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f32( <1 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x i64> %result } -define <2 x i64> @constrained_vector_fptosi_v2i64_v2f32() { +define <2 x i64> @constrained_vector_fptosi_v2i64_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i64_v2f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax @@ -4012,11 +4012,11 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f32() { entry: %result = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32( <2 x float>, - 
metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i64> %result } -define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() { +define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i64_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax @@ -4039,11 +4039,11 @@ entry: %result = call <3 x i64> @llvm.experimental.constrained.fptosi.v3i64.v3f32( <3 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x i64> %result } -define <4 x i64> @constrained_vector_fptosi_v4i64_v4f32() { +define <4 x i64> @constrained_vector_fptosi_v4i64_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i64_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax @@ -4076,11 +4076,11 @@ entry: %result = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32( <4 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i64> %result } -define <1 x i32> @constrained_vector_fptosi_v1i32_v1f64() { +define <1 x i32> @constrained_vector_fptosi_v1i32_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v1i32_v1f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax @@ -4093,12 +4093,12 @@ define <1 x i32> @constrained_vector_fptosi_v1i32_v1f64() { entry: %result = call <1 x i32> @llvm.experimental.constrained.fptosi.v1i32.v1f64( <1 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x i32> %result } -define <2 x i32> @constrained_vector_fptosi_v2i32_v2f64() { +define <2 x i32> @constrained_vector_fptosi_v2i32_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i32_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax @@ -4118,11 +4118,11 @@ define <2 x i32> @constrained_vector_fptosi_v2i32_v2f64() { entry: %result = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64( <2 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i32> %result } -define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() { +define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax @@ -4148,11 +4148,11 @@ entry: %result = call <3 x i32> @llvm.experimental.constrained.fptosi.v3i32.v3f64( <3 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x i32> %result } -define <4 x i32> @constrained_vector_fptosi_v4i32_v4f64() { +define <4 x i32> @constrained_vector_fptosi_v4i32_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i32_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax @@ -4183,11 +4183,11 @@ entry: %result = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f64( <4 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i32> %result } -define <1 x i64> @constrained_vector_fptosi_v1i64_v1f64() { +define <1 x i64> @constrained_vector_fptosi_v1i64_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v1i64_v1f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax @@ -4200,11 +4200,11 @@ define <1 x i64> @constrained_vector_fptosi_v1i64_v1f64() { entry: %result = call <1 x i64> @llvm.experimental.constrained.fptosi.v1i64.v1f64( <1 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x i64> %result } -define <2 x i64> @constrained_vector_fptosi_v2i64_v2f64() { +define <2 x 
i64> @constrained_vector_fptosi_v2i64_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i64_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax @@ -4225,11 +4225,11 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f64() { entry: %result = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64( <2 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i64> %result } -define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() { +define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v3i64_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax @@ -4252,11 +4252,11 @@ entry: %result = call <3 x i64> @llvm.experimental.constrained.fptosi.v3i64.v3f64( <3 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x i64> %result } -define <4 x i64> @constrained_vector_fptosi_v4i64_v4f64() { +define <4 x i64> @constrained_vector_fptosi_v4i64_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i64_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax @@ -4289,11 +4289,11 @@ entry: %result = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f64( <4 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i64> %result } -define <1 x i32> @constrained_vector_fptoui_v1i32_v1f32() { +define <1 x i32> @constrained_vector_fptoui_v1i32_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i32_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax @@ -4306,11 +4306,11 @@ define <1 x i32> @constrained_vector_fptoui_v1i32_v1f32() { entry: %result = call <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f32( <1 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x i32> %result } -define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() { +define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i32_v2f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax @@ -4330,11 +4330,11 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() { entry: %result = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f32( <2 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i32> %result } -define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() { +define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax @@ -4360,11 +4360,11 @@ entry: %result = call <3 x i32> @llvm.experimental.constrained.fptoui.v3i32.v3f32( <3 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x i32> %result } -define <4 x i32> @constrained_vector_fptoui_v4i32_v4f32() { +define <4 x i32> @constrained_vector_fptoui_v4i32_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i32_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax @@ -4395,11 +4395,11 @@ entry: %result = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32( <4 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i32> %result } -define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() { +define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si 
{{.*}}(%rip), %rax @@ -4412,11 +4412,11 @@ define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() { entry: %result = call <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f32( <1 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x i64> %result } -define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() { +define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax @@ -4437,11 +4437,11 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() { entry: %result = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32( <2 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i64> %result } -define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() { +define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax @@ -4464,11 +4464,11 @@ entry: %result = call <3 x i64> @llvm.experimental.constrained.fptoui.v3i64.v3f32( <3 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x i64> %result } -define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() { +define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttss2si {{.*}}(%rip), %rax @@ -4501,11 +4501,11 @@ entry: %result = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32( <4 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i64> %result } -define <1 x i32> @constrained_vector_fptoui_v1i32_v1f64() { +define <1 x i32> @constrained_vector_fptoui_v1i32_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i32_v1f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax @@ -4518,11 +4518,11 @@ define <1 x i32> @constrained_vector_fptoui_v1i32_v1f64() { entry: %result = call <1 x i32> @llvm.experimental.constrained.fptoui.v1i32.v1f64( <1 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x i32> %result } -define <2 x i32> @constrained_vector_fptoui_v2i32_v2f64() { +define <2 x i32> @constrained_vector_fptoui_v2i32_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i32_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax @@ -4542,11 +4542,11 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f64() { entry: %result = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64( <2 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i32> %result } -define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() { +define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax @@ -4572,11 +4572,11 @@ entry: %result = call <3 x i32> @llvm.experimental.constrained.fptoui.v3i32.v3f64( <3 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x i32> %result } -define <4 x i32> @constrained_vector_fptoui_v4i32_v4f64() { +define <4 x i32> @constrained_vector_fptoui_v4i32_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i32_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax @@ -4607,11 +4607,11 @@ entry: %result = call <4 x i32> 
@llvm.experimental.constrained.fptoui.v4i32.v4f64( <4 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i32> %result } -define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() { +define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax @@ -4624,11 +4624,11 @@ define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() { entry: %result = call <1 x i64> @llvm.experimental.constrained.fptoui.v1i64.v1f64( <1 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x i64> %result } -define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() { +define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax @@ -4649,11 +4649,11 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() { entry: %result = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64( <2 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x i64> %result } -define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() { +define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax @@ -4676,11 +4676,11 @@ entry: %result = call <3 x i64> @llvm.experimental.constrained.fptoui.v3i64.v3f64( <3 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x i64> %result } -define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() { +define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %rax @@ -4713,12 +4713,12 @@ entry: %result = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f64( <4 x double>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x i64> %result } -define <1 x float> @constrained_vector_fptrunc_v1f64() { +define <1 x float> @constrained_vector_fptrunc_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v1f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -4734,11 +4734,11 @@ entry: %result = call <1 x float> @llvm.experimental.constrained.fptrunc.v1f32.v1f64( <1 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %result } -define <2 x float> @constrained_vector_fptrunc_v2f64() { +define <2 x float> @constrained_vector_fptrunc_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -4760,11 +4760,11 @@ entry: %result = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64( <2 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x float> %result } -define <3 x float> @constrained_vector_fptrunc_v3f64() { +define <3 x float> @constrained_vector_fptrunc_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -4793,11 +4793,11 @@ entry: <3 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %result } -define <4 x float> @constrained_vector_fptrunc_v4f64() { +define <4 
x float> @constrained_vector_fptrunc_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -4822,11 +4822,11 @@ entry: <4 x double>, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x float> %result } -define <1 x double> @constrained_vector_fpext_v1f32() { +define <1 x double> @constrained_vector_fpext_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -4841,11 +4841,11 @@ define <1 x double> @constrained_vector_fpext_v1f32() { entry: %result = call <1 x double> @llvm.experimental.constrained.fpext.v1f64.v1f32( <1 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x double> %result } -define <2 x double> @constrained_vector_fpext_v2f32() { +define <2 x double> @constrained_vector_fpext_v2f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v2f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -4866,11 +4866,11 @@ define <2 x double> @constrained_vector_fpext_v2f32() { entry: %result = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32( <2 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %result } -define <3 x double> @constrained_vector_fpext_v3f32() { +define <3 x double> @constrained_vector_fpext_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -4898,11 +4898,11 @@ entry: %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32( <3 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %result } -define <4 x double> @constrained_vector_fpext_v4f32() { +define <4 x double> @constrained_vector_fpext_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v4f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -4925,11 +4925,11 @@ entry: %result = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32( <4 x float>, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <4 x double> %result } -define <1 x float> @constrained_vector_ceil_v1f32() { +define <1 x float> @constrained_vector_ceil_v1f32() #0 { ; CHECK-LABEL: constrained_vector_ceil_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -4949,11 +4949,11 @@ entry: %ceil = call <1 x float> @llvm.experimental.constrained.ceil.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %ceil } -define <2 x double> @constrained_vector_ceil_v2f64() { +define <2 x double> @constrained_vector_ceil_v2f64() #0 { ; CHECK-LABEL: constrained_vector_ceil_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -4977,11 +4977,11 @@ entry: %ceil = call <2 x double> @llvm.experimental.constrained.ceil.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %ceil } -define <3 x float> @constrained_vector_ceil_v3f32() { +define <3 x float> @constrained_vector_ceil_v3f32() #0 { ; CHECK-LABEL: constrained_vector_ceil_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -5018,11 +5018,11 @@ entry: %ceil = call <3 x float> @llvm.experimental.constrained.ceil.v3f32( <3 x float> , metadata !"round.dynamic", - 
metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %ceil } -define <3 x double> @constrained_vector_ceil_v3f64() { +define <3 x double> @constrained_vector_ceil_v3f64() #0 { ; CHECK-LABEL: constrained_vector_ceil_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -5056,11 +5056,11 @@ entry: %ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %ceil } -define <1 x float> @constrained_vector_floor_v1f32() { +define <1 x float> @constrained_vector_floor_v1f32() #0 { ; CHECK-LABEL: constrained_vector_floor_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -5080,12 +5080,12 @@ entry: %floor = call <1 x float> @llvm.experimental.constrained.floor.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %floor } -define <2 x double> @constrained_vector_floor_v2f64() { +define <2 x double> @constrained_vector_floor_v2f64() #0 { ; CHECK-LABEL: constrained_vector_floor_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -5109,11 +5109,11 @@ entry: %floor = call <2 x double> @llvm.experimental.constrained.floor.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %floor } -define <3 x float> @constrained_vector_floor_v3f32() { +define <3 x float> @constrained_vector_floor_v3f32() #0 { ; CHECK-LABEL: constrained_vector_floor_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -5150,11 +5150,11 @@ entry: %floor = call <3 x float> @llvm.experimental.constrained.floor.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %floor } -define <3 x double> @constrained_vector_floor_v3f64() { +define <3 x double> @constrained_vector_floor_v3f64() #0 { ; CHECK-LABEL: constrained_vector_floor_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -5188,11 +5188,11 @@ entry: %floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %floor } -define <1 x float> @constrained_vector_round_v1f32() { +define <1 x float> @constrained_vector_round_v1f32() #0 { ; CHECK-LABEL: constrained_vector_round_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -5216,11 +5216,11 @@ entry: %round = call <1 x float> @llvm.experimental.constrained.round.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %round } -define <2 x double> @constrained_vector_round_v2f64() { +define <2 x double> @constrained_vector_round_v2f64() #0 { ; CHECK-LABEL: constrained_vector_round_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -5254,11 +5254,11 @@ entry: %round = call <2 x double> @llvm.experimental.constrained.round.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %round } -define <3 x float> @constrained_vector_round_v3f32() { +define <3 x float> @constrained_vector_round_v3f32() #0 { ; CHECK-LABEL: constrained_vector_round_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -5303,12 +5303,12 @@ entry: %round = call <3 x float> 
@llvm.experimental.constrained.round.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %round } -define <3 x double> @constrained_vector_round_v3f64() { +define <3 x double> @constrained_vector_round_v3f64() #0 { ; CHECK-LABEL: constrained_vector_round_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -5355,11 +5355,11 @@ entry: %round = call <3 x double> @llvm.experimental.constrained.round.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %round } -define <1 x float> @constrained_vector_trunc_v1f32() { +define <1 x float> @constrained_vector_trunc_v1f32() #0 { ; CHECK-LABEL: constrained_vector_trunc_v1f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax @@ -5379,11 +5379,11 @@ entry: %trunc = call <1 x float> @llvm.experimental.constrained.trunc.v1f32( <1 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <1 x float> %trunc } -define <2 x double> @constrained_vector_trunc_v2f64() { +define <2 x double> @constrained_vector_trunc_v2f64() #0 { ; CHECK-LABEL: constrained_vector_trunc_v2f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -5407,11 +5407,11 @@ entry: %trunc = call <2 x double> @llvm.experimental.constrained.trunc.v2f64( <2 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <2 x double> %trunc } -define <3 x float> @constrained_vector_trunc_v3f32() { +define <3 x float> @constrained_vector_trunc_v3f32() #0 { ; CHECK-LABEL: constrained_vector_trunc_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp @@ -5448,11 +5448,11 @@ entry: %trunc = call <3 x float> @llvm.experimental.constrained.trunc.v3f32( <3 x float> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x float> %trunc } -define <3 x double> @constrained_vector_trunc_v3f64() { +define <3 x double> @constrained_vector_trunc_v3f64() #0 { ; CHECK-LABEL: constrained_vector_trunc_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp @@ -5486,10 +5486,11 @@ entry: %trunc = call <3 x double> @llvm.experimental.constrained.trunc.v3f64( <3 x double> , metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret <3 x double> %trunc } +attributes #0 = { strictfp } ; Single width declarations declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index a0306dc1cd4cee..b0d6a20bdf38b2 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -223,8 +223,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v4i64_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 @@ -243,8 +242,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_packus_v4i64_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 
killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 @@ -1660,10 +1658,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { ; ; AVX512-LABEL: trunc_packus_v8i64_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp slt <8 x i64> %a0, @@ -2735,20 +2732,57 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_packus_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_packus_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_packus_v16i64_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512VL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_packus_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_packus_v16i64_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512BWVL-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %1 = icmp slt <16 x i64> %a0, %2 = select <16 x i1> 
%1, <16 x i64> %a0, <16 x i64> %3 = icmp sgt <16 x i64> %2, zeroinitializer @@ -2792,10 +2826,9 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512VL-LABEL: trunc_packus_v8i32_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovusdb %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -2811,10 +2844,9 @@ define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %1 = icmp slt <8 x i32> %a0, diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index bb734bb8e32908..774a478a5d72df 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -233,10 +233,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_ssat_v4i64_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -251,10 +249,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper @@ -1571,9 +1567,7 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) { ; ; AVX512-LABEL: trunc_ssat_v8i64_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovsqb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp slt <8 x i64> %a0, @@ -2723,20 +2717,51 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_ssat_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] -; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = 
[18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX512F-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovsqb %zmm1, %xmm1 +; AVX512VL-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] +; AVX512BW-NEXT: vpminsq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] +; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_ssat_v16i64_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsqb %zmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %1 = icmp slt <16 x i64> %a0, %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> %3 = icmp sgt <16 x i64> %2, @@ -2780,9 +2805,7 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovsdb %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -2798,9 +2821,7 @@ define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovsdb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %1 = icmp slt <8 x i32> %a0, diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll 
b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index f1c19d91cd3239..7489d393585ffb 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -1175,8 +1175,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) { ; ; AVX512-LABEL: trunc_usat_v8i64_v8i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp ult <8 x i64> %a0, @@ -1843,17 +1842,45 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_usat_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpminuq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_usat_v16i64_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: trunc_usat_v16i64_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512VL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: trunc_usat_v16i64_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpminuq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: trunc_usat_v16i64_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovusqb %zmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %1 = icmp ult <16 x i64> %a0, %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> %3 = trunc <16 x i64> %2 to <16 x i8> @@ -1952,8 +1979,7 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512VL-LABEL: trunc_usat_v8i32_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovusdb %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1967,8 +1993,7 @@ define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { ; ; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %1 = icmp ult <8 x i32> %a0, diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-bad-transfer.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-bad-transfer.mir new file mode 100644 
index 00000000000000..1d978b9c455320 --- /dev/null +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-bad-transfer.mir @@ -0,0 +1,103 @@ +# RUN: llc %s -mtriple=x86_64-unknown-unknown -o - -run-pass=livedebugvalues | FileCheck %s --implicit-check-not=DBG_VALUE +# +# Test that the DBG_VALUE of ecx below does not get propagated. It is considered +# live-in on LiveDebugValues' first pass through the loop, but on the second it +# should be removed from the InLocs set because it gets clobbered inside the +# loop. There should be no transfer from ecx to ebx -- this is ensured by the +# FileCheck implicit-check-not option. +# +# FIXME: we successfully prevent the false location (ebx) from being +# propagated into block 2, but the original transfer isn't yet eliminated. +# Thus we get no DBG_VALUe in block 2, but an invalid one in block 1. +# +# CHECK-LABEL: name: foo +# CHECK-LABEL: bb.0.entry: +# CHECK: $ecx = MOV32ri 0 +# CHECK-NEXT: DBG_VALUE +# CHECK-LABEL: bb.1.loop: +# CHECK: $ebx = COPY killed $ecx +# CHECK-NEXT: DBG_VALUE + +--- | + source_filename = "live-debug-values-remove-range.ll" + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + + declare void @llvm.dbg.value(metadata, metadata, metadata) + + define i32 @foo(i32 %bar) !dbg !4 { + entry: + br label %loop + loop: + br label %loop + exit: + ret i32 %bar + } + + !llvm.module.flags = !{!0, !1} + !llvm.dbg.cu = !{!2} + + !0 = !{i32 2, !"Debug Info Version", i32 3} + !1 = !{i32 2, !"Dwarf Version", i32 4} + !2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "beards", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) + !3 = !DIFile(filename: "bees.cpp", directory: ".") + !4 = distinct !DISubprogram(name: "nope", scope: !3, file: !3, line: 1, type: !5, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !8) + !5 = !DISubroutineType(types: !6) + !6 = !{!7} + !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !8 = !{!9} + !9 = !DILocalVariable(name: "thin", scope: !4, file: !3, line: 1, type: !7) + !10 = !DILocation(line: 1, scope: !4) + +... +--- +name: foo +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$edi' } +frameInfo: + stackSize: 8 + offsetAdjustment: -8 + maxAlignment: 1 + adjustsStack: true + hasCalls: true + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 8 +fixedStack: + - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '$rbx' } +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $edi, $rbx + + frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 16 + CFI_INSTRUCTION offset $rbx, -16 + $ebx = MOV32rr $edi + $eax = MOV32ri 0 + $ecx = MOV32ri 0 + DBG_VALUE $ecx, $noreg, !9, !DIExpression(), debug-location !10 + $edi = MOV32ri 0 + $esi = MOV32ri 0 + + bb.1.loop: + successors: %bb.1, %bb.2 + liveins: $ebx, $eax, $ecx, $edi, $esi + + $eax = COPY $ecx + $ebx = COPY killed $ecx + $ecx = COPY killed $edi + $edi = COPY killed $esi + $esi = MOV32ri 1 + TEST8ri killed renamable $al, 1, implicit-def $eflags + JCC_1 %bb.1, 5, implicit killed $eflags + + bb.2.exit: + liveins: $ebx + + $eax = MOV32rr killed $ebx + $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 8 + RETQ $eax + +... 
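The header comment of the new live-debug-values-bad-transfer.mir test above boils down to a small dataflow argument: a block's live-in variable locations are the intersection of its predecessors' live-out locations, back-edge predecessors are ignored until they have been visited once, and any location whose register is clobbered inside the block is dropped before that block's live-outs are recorded. The C++ sketch below models only that argument on the same two-block CFG; it is a minimal illustration, not LLVM's LiveDebugValues implementation, and every name in it (Block, LocSet, transfer, join, the "thin@ecx" location string) is invented for the sketch.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

using LocSet = std::set<std::string>;   // variable locations, e.g. "thin@ecx"

struct Block {
  std::vector<int> Preds;               // predecessor block indices
  std::set<std::string> Clobbers;       // registers written inside the block
};

// Drop every location whose register is clobbered by the block.
static LocSet transfer(const LocSet &In, const Block &B) {
  LocSet Out;
  for (const std::string &Loc : In) {
    std::string Reg = Loc.substr(Loc.find('@') + 1);
    if (!B.Clobbers.count(Reg))
      Out.insert(Loc);
  }
  return Out;
}

// Intersect the live-out sets of all predecessors seen so far.
static LocSet join(const Block &B, const std::vector<LocSet> &OutLocs,
                   const std::vector<bool> &Visited) {
  LocSet In;
  bool First = true;
  for (int P : B.Preds) {
    if (!Visited[P])
      continue;                         // optimistic: skip unexplored back-edges
    if (First) {
      In = OutLocs[P];
      First = false;
    } else {
      LocSet Tmp;
      for (const std::string &L : In)
        if (OutLocs[P].count(L))
          Tmp.insert(L);
      In = Tmp;
    }
  }
  return In;
}

int main() {
  // bb.0 places "thin" in $ecx; bb.1 loops on itself and clobbers $ecx.
  std::vector<Block> CFG = {
      {{}, {}},                         // bb.0.entry
      {{0, 1}, {"ecx"}},                // bb.1.loop (preds: entry and itself)
  };
  std::vector<LocSet> OutLocs(CFG.size());
  std::vector<bool> Visited(CFG.size(), false);
  OutLocs[0] = {"thin@ecx"};
  Visited[0] = true;

  for (int Pass = 1; Pass <= 2; ++Pass) {
    LocSet In = join(CFG[1], OutLocs, Visited);
    std::printf("pass %d: loop considers thin@ecx live-in: %s\n", Pass,
                In.count("thin@ecx") ? "yes" : "no");
    OutLocs[1] = transfer(In, CFG[1]);
    Visited[1] = true;
  }
  return 0;
}

Running it prints that thin@ecx is treated as live-in to the loop on the first pass but not on the second, which is why the test above expects no DBG_VALUE for the variable to be propagated into bb.2.exit.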
diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-restore.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-restore.mir index aedb31371bd7c6..ded90d4a681514 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-restore.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-restore.mir @@ -7,6 +7,7 @@ # 'g': test for a crash from PR42773 # 'h': complex expressions should be restored # 'i': spills should be restored across block boundaries +# 'j': indirect DBG_VALUEs should be indirect after restoration # #define FORCE_SPILL() \ # __asm volatile("" : : : \ @@ -24,6 +25,7 @@ # CHECK: ![[QVAR:[0-9]+]] = !DILocalVariable(name: "q", # CHECK: ![[RVAR:[0-9]+]] = !DILocalVariable(name: "r", # CHECK: ![[SVAR:[0-9]+]] = !DILocalVariable(name: "s", +# CHECK: ![[TVAR:[0-9]+]] = !DILocalVariable(name: "t", # Ascertain that the spill has been recognized and manifested in a DBG_VALUE. # CHECK: MOV64mr $rsp,{{.*-8.*}}killed{{.*}}$rdi :: (store 8 into %stack.0) @@ -81,7 +83,7 @@ %0 = load i32, i32* %add.ptr, align 4, !dbg !223, !tbaa !24 ret i32 %0, !dbg !228 } - + define dso_local i32 @i(i32* readonly %p) local_unnamed_addr !dbg !307 { entry: br label %foo @@ -101,6 +103,25 @@ ret i32 %0, !dbg !328 } + define dso_local i32 @j(i32* readonly %p) local_unnamed_addr !dbg !402 { + entry: + br label %foo + + foo: + call void @llvm.dbg.value(metadata i32* %p, metadata !404, metadata !DIExpression()), !dbg !405 + %tobool = icmp eq i32* %p, null, !dbg !406 + br i1 %tobool, label %if.end, label %if.then, !dbg !408 + + if.then: ; preds = %entry + tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{dirflag},~{fpsr},~{flags}"(), !dbg !409, !srcloc !411 + br label %if.end, !dbg !412 + + if.end: ; preds = %entry, %if.then + %add.ptr = getelementptr inbounds i32, i32* %p, i64 1, !dbg !413 + %0 = load i32, i32* %add.ptr, align 4, !dbg !414, !tbaa !24 + ret i32 %0, !dbg !415 + } + declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} @@ -181,6 +202,21 @@ !322 = !DILocation(line: 109, column: 14, scope: !307) !323 = !DILocation(line: 109, column: 10, scope: !307) !328 = !DILocation(line: 109, column: 3, scope: !307) + !401 = !DIBasicType(name: "looong int", size: 64, encoding: DW_ATE_signed) + !402 = distinct !DISubprogram(name: "j", scope: !0, file: !1, line: 105, type: !8, scopeLine: 105, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !403) + !403 = !{!404} + !404 = !DILocalVariable(name: "t", arg: 1, scope: !402, file: !1, line: 105, type: !401) + !405 = !DILocation(line: 105, column: 12, scope: !402) + !406 = !DILocation(line: 106, column: 7, scope: !407) + !407 = distinct !DILexicalBlock(scope: !402, file: !1, line: 106, column: 7) + !408 = !DILocation(line: 106, column: 7, scope: !402) + !409 = !DILocation(line: 107, column: 5, scope: !410) + !410 = distinct !DILexicalBlock(scope: !407, file: !1, line: 106, column: 10) + !411 = !{i32 -2147471544} + !412 = !DILocation(line: 108, column: 3, scope: !410) + !413 = !DILocation(line: 109, column: 14, scope: !402) + !414 = !DILocation(line: 109, column: 10, scope: !402) + !415 = !DILocation(line: 109, column: 3, scope: !402) ... --- name: f @@ -595,3 +631,106 @@ body: | RETQ $eax, debug-location !328 ... +--- +# Test that if an unspilt DBG_VALUE starts as an indirect DBG_VALUE, then it +# is restored as an indirect DBG_VALUE. 
FIXME: Note that for the intervening +# period of being a spilt location there is still a missing layer of +# indirection. + +# CHECK-LABEL: name: j +# CHECK-LABEL: bb.0.entry: +# CHECK: DBG_VALUE $rdi, 0, ![[TVAR]], !DIExpression() +# CHECK-LABEL: bb.1.if.then: +# CHECK: DBG_VALUE $rsp, 0, ![[TVAR]], !DIExpression(DW_OP_constu, 8, DW_OP_minus) +# CHECK: INLINEASM +# CHECK: DBG_VALUE ${{[a-zA-Z0-9]+}}, 0, ![[TVAR]], !DIExpression() +# CHECK-LABEL: bb.2.if.end + +name: j +tracksRegLiveness: true +liveins: + - { reg: '$rdi', virtual-reg: '' } +frameInfo: + stackSize: 48 + offsetAdjustment: -48 + maxAlignment: 8 + cvBytesOfCalleeSavedRegisters: 48 +fixedStack: + - { id: 0, type: spill-slot, offset: -56, size: 8, alignment: 8, stack-id: default, + callee-saved-register: '$rbx', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } + - { id: 1, type: spill-slot, offset: -48, size: 8, alignment: 16, stack-id: default, + callee-saved-register: '$r12', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } + - { id: 2, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: default, + callee-saved-register: '$r13', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } + - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, stack-id: default, + callee-saved-register: '$r14', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } + - { id: 4, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default, + callee-saved-register: '$r15', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } + - { id: 5, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default, + callee-saved-register: '$rbp', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -64, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +body: | + bb.0.entry: + successors: %bb.2, %bb.1 + liveins: $rdi, $rbx, $r12, $r13, $r14, $r15, $rbp + + DBG_VALUE $rdi, 0, !404, !DIExpression(), debug-location !405 + DBG_VALUE $rdi, 0, !404, !DIExpression(), debug-location !405 + TEST64rr renamable $rdi, renamable $rdi, implicit-def $eflags, debug-location !406 + JCC_1 %bb.2, 4, implicit $eflags, debug-location !408 + + bb.1.if.then: + successors: %bb.2 + liveins: $rdi, $rbp, $r15, $r14, $r13, $r12, $rbx + + frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 16 + frame-setup PUSH64r killed $r15, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 24 + frame-setup PUSH64r killed $r14, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 32 + frame-setup PUSH64r killed $r13, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 40 + frame-setup PUSH64r killed $r12, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 48 + frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 56 + CFI_INSTRUCTION offset $rbx, -56 + CFI_INSTRUCTION offset $r12, -48 + CFI_INSTRUCTION offset $r13, -40 + CFI_INSTRUCTION offset $r14, -32 + CFI_INSTRUCTION offset $r15, 
-24 + CFI_INSTRUCTION offset $rbp, -16 + MOV64mr $rsp, 1, $noreg, -8, $noreg, killed renamable $rdi :: (store 8 into %stack.0) + INLINEASM &"", 1, 12, implicit-def dead early-clobber $rax, 12, implicit-def dead early-clobber $rbx, 12, implicit-def dead early-clobber $rcx, 12, implicit-def dead early-clobber $rdx, 12, implicit-def dead early-clobber $rsi, 12, implicit-def dead early-clobber $rdi, 12, implicit-def dead early-clobber $rbp, 12, implicit-def dead early-clobber $r8, 12, implicit-def dead early-clobber $r9, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15, 12, implicit-def dead early-clobber $eflags, !20, debug-location !409 + renamable $rdi = MOV64rm $rsp, 1, $noreg, -8, $noreg :: (load 8 from %stack.0) + $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 48 + $r12 = frame-destroy POP64r implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 40 + $r13 = frame-destroy POP64r implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 32 + $r14 = frame-destroy POP64r implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 24 + $r15 = frame-destroy POP64r implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 16 + $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 8 + + bb.2.if.end: + liveins: $rdi, $rbx, $r12, $r13, $r14, $r15, $rbp + + renamable $eax = MOV32rm killed renamable $rdi, 1, $noreg, 4, $noreg, debug-location !414 :: (load 4 from %ir.add.ptr, !tbaa !24) + RETQ $eax, debug-location !415 + +... diff --git a/llvm/test/ExecutionEngine/JITLink/X86/MachO_zero_fill_alignment.s b/llvm/test/ExecutionEngine/JITLink/X86/MachO_zero_fill_alignment.s index 3b3a3853fdce46..b65b0cb6f59d94 100644 --- a/llvm/test/ExecutionEngine/JITLink/X86/MachO_zero_fill_alignment.s +++ b/llvm/test/ExecutionEngine/JITLink/X86/MachO_zero_fill_alignment.s @@ -1,14 +1,14 @@ # RUN: rm -rf %t && mkdir -p %t # RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t/macho_zero_fill_align.o %s -# RUN: llvm-jitlink -noexec %t/macho_zero_fill_align.o -entry higher_zero_fill_align +# RUN: llvm-jitlink -noexec %t/macho_zero_fill_align.o -entry _higher_zero_fill_align .section __DATA,__data - .globl low_aligned_data + .globl _low_aligned_data .p2align 0 -low_aligned_data: +_low_aligned_data: .byte 42 - .globl higher_zero_fill_align -.zerofill __DATA,__zero_fill,higher_zero_fill_align,8,3 + .globl _higher_zero_fill_align +.zerofill __DATA,__zero_fill,_higher_zero_fill_align,8,3 .subsections_via_symbols diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll index 40641472d1c723..616897c3a00232 100644 --- a/llvm/test/Feature/fp-intrinsics.ll +++ b/llvm/test/Feature/fp-intrinsics.ll @@ -3,13 +3,13 @@ ; Test to verify that constants aren't folded when the rounding mode is unknown. 
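; As a concrete illustration (not checked by this test): folding the division of
; 1.0 by 10.0 in @f1 below would have to commit to a rounding mode up front,
; e.g. round-to-nearest gives 0x3FB999999999999A while round-toward-zero gives
; 0x3FB9999999999999, so under !"round.dynamic" the constrained call has to be
; left in place.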
; CHECK-LABEL: @f1 ; CHECK: call double @llvm.experimental.constrained.fdiv.f64 -define double @f1() { +define double @f1() #0 { entry: %div = call double @llvm.experimental.constrained.fdiv.f64( double 1.000000e+00, double 1.000000e+01, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %div } @@ -23,12 +23,12 @@ entry: ; ; CHECK-LABEL: @f2 ; CHECK: call double @llvm.experimental.constrained.fsub.f64 -define double @f2(double %a) { +define double @f2(double %a) #0 { entry: %div = call double @llvm.experimental.constrained.fsub.f64( double %a, double 0.000000e+00, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %div } @@ -45,21 +45,21 @@ entry: ; CHECK: call double @llvm.experimental.constrained.fsub.f64 ; CHECK: call double @llvm.experimental.constrained.fmul.f64 ; CHECK: call double @llvm.experimental.constrained.fsub.f64 -define double @f3(double %a, double %b) { +define double @f3(double %a, double %b) #0 { entry: %sub = call double @llvm.experimental.constrained.fsub.f64( double -0.000000e+00, double %a, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %mul = call double @llvm.experimental.constrained.fmul.f64( double %sub, double %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 %ret = call double @llvm.experimental.constrained.fsub.f64( double -0.000000e+00, double %mul, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %ret } @@ -77,7 +77,7 @@ entry: ; CHECK-LABEL: @f4 ; CHECK-NOT: select ; CHECK: br i1 %cmp -define double @f4(i32 %n, double %a) { +define double @f4(i32 %n, double %a) #0 { entry: %cmp = icmp sgt i32 %n, 0 br i1 %cmp, label %if.then, label %if.end @@ -86,7 +86,7 @@ if.then: %add = call double @llvm.experimental.constrained.fadd.f64( double 1.000000e+00, double %a, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 br label %if.end if.end: @@ -97,123 +97,123 @@ if.end: ; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f5 ; CHECK: call double @llvm.experimental.constrained.sqrt -define double @f5() { +define double @f5() #0 { entry: %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f6 ; CHECK: call double @llvm.experimental.constrained.pow -define double @f6() { +define double @f6() #0 { entry: %result = call double @llvm.experimental.constrained.pow.f64(double 42.1, double 3.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f7 ; CHECK: call double @llvm.experimental.constrained.powi -define double @f7() { +define double @f7() #0 { entry: %result = call double @llvm.experimental.constrained.powi.f64(double 42.1, i32 3, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that sin(42.0) isn't simplified when the rounding mode is unknown. 
; CHECK-LABEL: f8 ; CHECK: call double @llvm.experimental.constrained.sin -define double @f8() { +define double @f8() #0 { entry: %result = call double @llvm.experimental.constrained.sin.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that cos(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f9 ; CHECK: call double @llvm.experimental.constrained.cos -define double @f9() { +define double @f9() #0 { entry: %result = call double @llvm.experimental.constrained.cos.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f10 ; CHECK: call double @llvm.experimental.constrained.exp -define double @f10() { +define double @f10() #0 { entry: %result = call double @llvm.experimental.constrained.exp.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f11 ; CHECK: call double @llvm.experimental.constrained.exp2 -define double @f11() { +define double @f11() #0 { entry: %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that log(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f12 ; CHECK: call double @llvm.experimental.constrained.log -define double @f12() { +define double @f12() #0 { entry: %result = call double @llvm.experimental.constrained.log.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that log10(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f13 ; CHECK: call double @llvm.experimental.constrained.log10 -define double @f13() { +define double @f13() #0 { entry: %result = call double @llvm.experimental.constrained.log10.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that log2(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f14 ; CHECK: call double @llvm.experimental.constrained.log2 -define double @f14() { +define double @f14() #0 { entry: %result = call double @llvm.experimental.constrained.log2.f64(double 42.0, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } ; Verify that rint(42.1) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f15 ; CHECK: call double @llvm.experimental.constrained.rint -define double @f15() { +define double @f15() #0 { entry: %result = call double @llvm.experimental.constrained.rint.f64(double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } @@ -221,12 +221,12 @@ entry: ; unknown. ; CHECK-LABEL: f16 ; CHECK: call double @llvm.experimental.constrained.nearbyint -define double @f16() { +define double @f16() #0 { entry: %result = call double @llvm.experimental.constrained.nearbyint.f64( double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } @@ -234,11 +234,11 @@ entry: ; unknown. 
; CHECK-LABEL: f17 ; CHECK: call double @llvm.experimental.constrained.fma -define double @f17() { +define double @f17() #0 { entry: %result = call double @llvm.experimental.constrained.fma.f64(double 42.1, double 42.1, double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } @@ -246,11 +246,11 @@ entry: ; unknown. ; CHECK-LABEL: f18 ; CHECK: call zeroext i32 @llvm.experimental.constrained.fptoui -define zeroext i32 @f18() { +define zeroext i32 @f18() #0 { entry: %result = call zeroext i32 @llvm.experimental.constrained.fptoui.i32.f64( double 42.1, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %result } @@ -258,10 +258,10 @@ entry: ; unknown. ; CHECK-LABEL: f19 ; CHECK: call i32 @llvm.experimental.constrained.fptosi -define i32 @f19() { +define i32 @f19() #0 { entry: %result = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double 42.1, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret i32 %result } @@ -269,12 +269,12 @@ entry: ; unknown. ; CHECK-LABEL: f20 ; CHECK: call float @llvm.experimental.constrained.fptrunc -define float @f20() { +define float @f20() #0 { entry: %result = call float @llvm.experimental.constrained.fptrunc.f32.f64( double 42.1, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret float %result } @@ -282,13 +282,99 @@ entry: ; unknown. ; CHECK-LABEL: f21 ; CHECK: call double @llvm.experimental.constrained.fpext -define double @f21() { +define double @f21() #0 { entry: %result = call double @llvm.experimental.constrained.fpext.f64.f32(float 42.0, - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %result } +; Verify that lrint(42.1) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f22 +; CHECK: call i32 @llvm.experimental.constrained.lrint +define i32 @f22() #0 { +entry: + %result = call i32 @llvm.experimental.constrained.lrint.i32.f64(double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +; Verify that lrintf(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f23 +; CHECK: call i32 @llvm.experimental.constrained.lrint +define i32 @f23() #0 { +entry: + %result = call i32 @llvm.experimental.constrained.lrint.i32.f32(float 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +; Verify that llrint(42.1) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f24 +; CHECK: call i64 @llvm.experimental.constrained.llrint +define i64 @f24() #0 { +entry: + %result = call i64 @llvm.experimental.constrained.llrint.i64.f64(double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +; Verify that llrint(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f25 +; CHECK: call i64 @llvm.experimental.constrained.llrint +define i64 @f25() #0 { +entry: + %result = call i64 @llvm.experimental.constrained.llrint.i64.f32(float 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +; Verify that lround(42.1) isn't simplified when the rounding mode is unknown. 
+; CHECK-LABEL: f26 +; CHECK: call i32 @llvm.experimental.constrained.lround +define i32 @f26() #0 { +entry: + %result = call i32 @llvm.experimental.constrained.lround.i32.f64(double 42.1, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +; Verify that lround(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f27 +; CHECK: call i32 @llvm.experimental.constrained.lround +define i32 @f27() #0 { +entry: + %result = call i32 @llvm.experimental.constrained.lround.i32.f32(float 42.0, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +; Verify that llround(42.1) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f28 +; CHECK: call i64 @llvm.experimental.constrained.llround +define i64 @f28() #0 { +entry: + %result = call i64 @llvm.experimental.constrained.llround.i64.f64(double 42.1, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +; Verify that llround(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f29 +; CHECK: call i64 @llvm.experimental.constrained.llround +define i64 @f29() #0 { +entry: + %result = call i64 @llvm.experimental.constrained.llround.i64.f32(float 42.0, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +attributes #0 = { strictfp } + @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) @@ -311,3 +397,11 @@ declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f64(double, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) +declare i32 @llvm.experimental.constrained.lrint.i32.f64(double, metadata, metadata) +declare i32 @llvm.experimental.constrained.lrint.i32.f32(float, metadata, metadata) +declare i64 @llvm.experimental.constrained.llrint.i64.f64(double, metadata, metadata) +declare i64 @llvm.experimental.constrained.llrint.i64.f32(float, metadata, metadata) +declare i32 @llvm.experimental.constrained.lround.i32.f64(double, metadata) +declare i32 @llvm.experimental.constrained.lround.i32.f32(float, metadata) +declare i64 @llvm.experimental.constrained.llround.i64.f64(double, metadata) +declare i64 @llvm.experimental.constrained.llround.i64.f32(float, metadata) diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll new file mode 100644 index 00000000000000..63de8663e0775f --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll @@ -0,0 +1,38 @@ +; Make sure MSan handles llvm.launder.invariant.group correctly. 
+ +; RUN: opt < %s -msan -msan-kernel=1 -O1 -S | FileCheck -check-prefixes=CHECK %s +; RUN: opt < %s -msan -O1 -S | FileCheck -check-prefixes=CHECK %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%class.Foo = type { i32 (...)** } +@flag = dso_local local_unnamed_addr global i8 0, align 1 + +define dso_local %class.Foo* @_Z1fv() local_unnamed_addr #0 { +entry: + %p = alloca i8*, align 8 + %0 = bitcast i8** %p to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) + %1 = load i8, i8* @flag, align 1 + %tobool = icmp ne i8 %1, 0 + %call = call zeroext i1 @_Z2f1PPvb(i8** nonnull %p, i1 zeroext %tobool) + %2 = load i8*, i8** %p, align 8 + %3 = call i8* @llvm.launder.invariant.group.p0i8(i8* %2) + %4 = bitcast i8* %3 to %class.Foo* + %retval.0 = select i1 %call, %class.Foo* %4, %class.Foo* null + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) + ret %class.Foo* %retval.0 +} + +; CHECK-NOT: call void @__msan_warning_noreturn + +declare dso_local zeroext i1 @_Z2f1PPvb(i8**, i1 zeroext) local_unnamed_addr + +declare i8* @llvm.launder.invariant.group.p0i8(i8*) + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +attributes #0 = { sanitize_memory uwtable } diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll new file mode 100644 index 00000000000000..f3b5c0d722c8a4 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll @@ -0,0 +1,21 @@ +; Make sure MSan handles llvm.launder.invariant.group correctly. + +; RUN: opt < %s -msan -msan-kernel=1 -O1 -S | FileCheck -check-prefixes=CHECK %s +; RUN: opt < %s -msan -O1 -S | FileCheck -check-prefixes=CHECK %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@flag = dso_local local_unnamed_addr global i8 0, align 1 + +define dso_local i8* @f(i8* %x) local_unnamed_addr #0 { +entry: + %0 = call i8* @llvm.strip.invariant.group.p0i8(i8* %x) + ret i8* %0 +} + +; CHECK-NOT: call void @__msan_warning_noreturn + +declare i8* @llvm.strip.invariant.group.p0i8(i8*) + +attributes #0 = { sanitize_memory uwtable } diff --git a/llvm/test/MC/AArch64/armv8.2a-bfc.s b/llvm/test/MC/AArch64/armv8.2a-bfc.s new file mode 100644 index 00000000000000..1346850d401c85 --- /dev/null +++ b/llvm/test/MC/AArch64/armv8.2a-bfc.s @@ -0,0 +1,11 @@ +// RUN: llvm-mc -triple aarch64-linux-gnu -o - %s | FileCheck %s --check-prefix=BFI +// RUN: llvm-mc -triple aarch64-linux-gnu -o - %s -mattr=+v8.1a | FileCheck %s --check-prefix=BFI +// RUN: llvm-mc -triple aarch64-linux-gnu -o - %s -mattr=-v8.2a | FileCheck %s --check-prefix=BFI +// RUN: llvm-mc -triple aarch64-linux-gnu -o - %s -mattr=+v8.2a | FileCheck %s --check-prefix=BFC +// RUN: llvm-mc -triple aarch64-linux-gnu -o - %s -mattr=+v8.3a | FileCheck %s --check-prefix=BFC +// RUN: llvm-mc -triple aarch64-linux-gnu -o - %s -mattr=+v8.4a | FileCheck %s --check-prefix=BFC +// RUN: llvm-mc -triple aarch64-linux-gnu -o - %s -mattr=+v8.5a | FileCheck %s --check-prefix=BFC + bfc w0, #1, #5 + +// BFI: bfi w0, wzr, #1, #5 +// BFC: bfc w0, #1, #5 diff --git a/llvm/test/MC/AArch64/basic-a64-instructions.s b/llvm/test/MC/AArch64/basic-a64-instructions.s index 07afdba8c378da..0cb8e3eb2079f2 100644 --- 
a/llvm/test/MC/AArch64/basic-a64-instructions.s +++ b/llvm/test/MC/AArch64/basic-a64-instructions.s @@ -978,7 +978,7 @@ _func: bfm x5, x6, #12, #63 // CHECK: bfi x4, x5, #52, #11 // encoding: [0xa4,0x28,0x4c,0xb3] // CHECK: bfxil xzr, x4, #0, #1 // encoding: [0x9f,0x00,0x40,0xb3] -// CHECK: bfc x4, #1, #6 // encoding: [0xe4,0x17,0x7f,0xb3] +// CHECK: bfi x4, xzr, #1, #6 // encoding: [0xe4,0x17,0x7f,0xb3] // CHECK: bfxil x5, x6, #12, #52 // encoding: [0xc5,0xfc,0x4c,0xb3] sxtb w1, w2 @@ -1078,7 +1078,7 @@ _func: // CHECK: bfxil w9, w10, #0, #32 // encoding: [0x49,0x7d,0x00,0x33] // CHECK: bfi w11, w12, #31, #1 // encoding: [0x8b,0x01,0x01,0x33] // CHECK: bfi w13, w14, #29, #3 // encoding: [0xcd,0x09,0x03,0x33] -// CHECK: bfc xzr, #10, #11 // encoding: [0xff,0x2b,0x76,0xb3] +// CHECK: bfi xzr, xzr, #10, #11 // encoding: [0xff,0x2b,0x76,0xb3] bfxil w9, w10, #0, #1 bfxil x2, x3, #63, #1 @@ -1137,10 +1137,10 @@ _func: bfc wzr, #31, #1 bfc x0, #5, #9 bfc xzr, #63, #1 -// CHECK: bfc w3, #0, #32 // encoding: [0xe3,0x7f,0x00,0x33] -// CHECK: bfc wzr, #31, #1 // encoding: [0xff,0x03,0x01,0x33] -// CHECK: bfc x0, #5, #9 // encoding: [0xe0,0x23,0x7b,0xb3] -// CHECK: bfc xzr, #63, #1 // encoding: [0xff,0x03,0x41,0xb3] +// CHECK: bfxil w3, wzr, #0, #32 // encoding: [0xe3,0x7f,0x00,0x33] +// CHECK: bfi wzr, wzr, #31, #1 // encoding: [0xff,0x03,0x01,0x33] +// CHECK: bfi x0, xzr, #5, #9 // encoding: [0xe0,0x23,0x7b,0xb3] +// CHECK: bfi xzr, xzr, #63, #1 // encoding: [0xff,0x03,0x41,0xb3] //------------------------------------------------------------------------------ // Compare & branch (immediate) diff --git a/llvm/test/MC/AMDGPU/flat-scratch-instructions.s b/llvm/test/MC/AMDGPU/flat-scratch-instructions.s index 3d1339023f420d..c0e1670a6bd4fa 100644 --- a/llvm/test/MC/AMDGPU/flat-scratch-instructions.s +++ b/llvm/test/MC/AMDGPU/flat-scratch-instructions.s @@ -115,6 +115,26 @@ scratch_load_dword v1, v2, off offset:-4097 // GFX9-ERR: :32: error: expected a 13-bit signed offset // VI-ERR: :32: error: not a valid operand. +scratch_load_dword v0, v1, off offset:-2049 glc slc +// GFX10-ERR: :32: error: expected a 12-bit signed offset +// GFX9: scratch_load_dword v0, v1, off offset:-2049 glc slc ; encoding: [0xff,0x57,0x53,0xdc,0x01,0x00,0x7f,0x00] +// VI-ERR: :32: error: not a valid operand. + +scratch_load_dword v0, v1, off offset:-2048 glc slc +// GFX10: scratch_load_dword v0, v1, off offset:-2048 glc slc ; encoding: [0x00,0x48,0x33,0xdc,0x01,0x00,0x7d,0x00] +// GFX9: scratch_load_dword v0, v1, off offset:-2048 glc slc ; encoding: [0x00,0x58,0x53,0xdc,0x01,0x00,0x7f,0x00] +// VI-ERR: :32: error: not a valid operand. + +scratch_load_dword v255, off, s1 offset:2047 +// GFX10: scratch_load_dword v255, off, s1 offset:2047 ; encoding: [0xff,0x47,0x30,0xdc,0x00,0x00,0x01,0xff] +// GFX9: scratch_load_dword v255, off, s1 offset:2047 ; encoding: [0xff,0x47,0x50,0xdc,0x00,0x00,0x01,0xff] +// VI-ERR: :34: error: not a valid operand. + +scratch_load_dword v255, off, s0 offset:2048 +// GFX10-ERR: :34: error: expected a 12-bit signed offset +// GFX9: scratch_load_dword v255, off, s0 offset:2048 ; encoding: [0x00,0x48,0x50,0xdc,0x00,0x00,0x00,0xff] +// VI-ERR: :34: error: not a valid operand. 
+ scratch_store_byte v1, v2, off // GFX10: encoding: [0x00,0x40,0x60,0xdc,0x01,0x02,0x7d,0x00] // GFX9: scratch_store_byte v1, v2, off ; encoding: [0x00,0x40,0x60,0xdc,0x01,0x02,0x7f,0x00] diff --git a/llvm/test/MC/COFF/symidx.s b/llvm/test/MC/COFF/symidx.s index 46c00334764a3b..5c1e740283c844 100644 --- a/llvm/test/MC/COFF/symidx.s +++ b/llvm/test/MC/COFF/symidx.s @@ -8,8 +8,8 @@ bar: .symidx bar .symidx foo -// CHECK: Contents of section .data: -// CHECK-NEXT: 0000 0[[BAR:[1-9]]]000000 0[[FOO:[1-9]]]000000 // CHECK: SYMBOL TABLE: -// CHECK: [ [[FOO]]](sec 1)(fl 0x00)(ty 0)(scl 3) (nx 0) 0x00000000 foo -// CHECK-NEXT: [ [[BAR]]](sec 1)(fl 0x00)(ty 0)(scl 3) (nx 0) 0x00000001 bar +// CHECK: [ [[FOO:[1-9]]]](sec 1)(fl 0x00)(ty 0)(scl 3) (nx 0) 0x00000000 foo +// CHECK-NEXT: [ [[BAR:[1-9]]]](sec 1)(fl 0x00)(ty 0)(scl 3) (nx 0) 0x00000001 bar +// CHECK: Contents of section .data: +// CHECK-NEXT: 0000 0[[BAR]]000000 0[[FOO]]000000 diff --git a/llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt b/llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt index 74edc4c4a545a4..482dd1f68edca8 100644 --- a/llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt +++ b/llvm/test/MC/Disassembler/AArch64/basic-a64-instructions.txt @@ -1,6 +1,7 @@ # RUN: llvm-mc -triple=aarch64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s # RUN: llvm-mc -triple=arm64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s # RUN: llvm-mc -triple=arm64 -mattr=+fp-armv8,+fullfp16 -disassemble < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FP16 +# RUN: llvm-mc -triple=arm64 -mattr=+v8.2a -disassemble < %s | FileCheck %s --check-prefix=CHECK-V82 # RUN: llvm-mc -triple=arm64 -mattr=+v8.3a -disassemble < %s | FileCheck %s --check-prefix=CHECK-V83 #------------------------------------------------------------------------------ @@ -624,7 +625,8 @@ # CHECK: bfi x4, x5, #52, #11 # CHECK: bfxil xzr, x4, #0, #1 -# CHECK: bfc x4, #1, #6 +# CHECK: bfi x4, xzr, #1, #6 +# CHECK-V82: bfc x4, #1, #6 # CHECK: bfxil x5, x6, #12, #52 0xa4 0x28 0x4c 0xb3 0x9f 0x0 0x40 0xb3 @@ -717,7 +719,8 @@ # CHECK: bfxil w9, w10, #0, #32 # CHECK: bfi w11, w12, #31, #1 # CHECK: bfi w13, w14, #29, #3 -# CHECK: bfc xzr, #10, #11 +# CHECK-V82: bfc xzr, #10, #11 +# CHECK: bfi xzr, xzr, #10, #11 0x49 0x1 0x0 0x33 0x62 0x0 0x41 0xb3 0x93 0xfe 0x40 0xb3 diff --git a/llvm/test/MC/Disassembler/AMDGPU/flat_gfx10.txt b/llvm/test/MC/Disassembler/AMDGPU/flat_gfx10.txt new file mode 100644 index 00000000000000..6fc105f6e5949a --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/flat_gfx10.txt @@ -0,0 +1,75 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -disassemble -show-encoding < %s | FileCheck %s + +#===------------------------------------------------------------------------===# +# scratch_load +#===------------------------------------------------------------------------===# + +# CHECK: scratch_load_dword v1, v255, off offset:-1 glc dlc ; encoding: [0xff,0x5f,0x31,0xdc,0xff,0x00,0x7d,0x01] +0xff 0x5f 0x31 0xdc 0xff 0x00 0x7d 0x01 + +# CHECK: scratch_load_dword v5, v255, off offset:-1 glc slc ; encoding: [0xff,0x4f,0x33,0xdc,0xff,0x00,0x7d,0x05] +0xff 0x4f 0x33 0xdc 0xff 0x00 0x7d 0x05 + +# CHECK: scratch_load_dword v0, v1, off offset:-2048 glc slc dlc ; encoding: [0x00,0x58,0x33,0xdc,0x01,0x00,0x7d,0x00] +0x00 0x58 0x33 0xdc 0x01 0x00 0x7d 0x00 + +# CHECK: scratch_load_dword v255, off, s105 offset:2047 dlc ; encoding: [0xff,0x57,0x30,0xdc,0x00,0x00,0x69,0xff] +0xff 0x57 0x30 0xdc 0x00 0x00 0x69 0xff + +# CHECK: scratch_load_dword v255, v2, off 
; encoding: [0x00,0x40,0x30,0xdc,0x02,0x00,0x7d,0xff] +0x00 0x40 0x30 0xdc 0x02 0x00 0x7d 0xff + +# CHECK: scratch_load_dword v5, v0, off dlc ; encoding: [0x00,0x50,0x30,0xdc,0x00,0x00,0x7d,0x05] +0x00 0x50 0x30 0xdc 0x00 0x00 0x7d 0x05 + +# CHECK: scratch_load_dword v5, v3, off slc ; encoding: [0x00,0x40,0x32,0xdc,0x03,0x00,0x7d,0x05] +0x00 0x40 0x32 0xdc 0x03 0x00 0x7d 0x05 + +# CHECK: scratch_load_dword v5, v255, off slc dlc ; encoding: [0x00,0x50,0x32,0xdc,0xff,0x00,0x7d,0x05] +0x00 0x50 0x32 0xdc 0xff 0x00 0x7d 0x05 + +# CHECK: scratch_load_dword v255, off, s2 offset:1 ; encoding: [0x01,0x40,0x30,0xdc,0x00,0x00,0x02,0xff] +0x01 0x40 0x30 0xdc 0x00 0x00 0x02 0xff + +#===------------------------------------------------------------------------===# +# scratch_store +#===------------------------------------------------------------------------===# + +# CHECK: scratch_store_dword off, v2, s3 offset:-1 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x03,0x00] +0xff,0x4f,0x70,0xdc,0x00,0x02,0x03,0x00 + +# CHECK: scratch_store_dword off, v255, s3 offset:-1 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0xff,0x03,0x00] +0xff,0x4f,0x70,0xdc,0x00,0xff,0x03,0x00 + +# CHECK: scratch_store_dword off, v2, s105 offset:-1 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x69,0x00] +0xff,0x4f,0x70,0xdc,0x00,0x02,0x69,0x00 + +# CHECK: scratch_store_dword off, v2, vcc_lo offset:-1 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x6a,0x00] +0xff,0x4f,0x70,0xdc,0x00,0x02,0x6a,0x00 + +# CHECK: scratch_store_dword off, v2, vcc_hi offset:-1 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x6b,0x00] +0xff,0x4f,0x70,0xdc,0x00,0x02,0x6b,0x00 + +# CHECK: scratch_store_dword off, v2, ttmp15 offset:-1 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x7b,0x00] +0xff,0x4f,0x70,0xdc,0x00,0x02,0x7b,0x00 + +# CHECK: scratch_store_dword v0, v2, off offset:-1 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x7d,0x00] +0xff,0x4f,0x70,0xdc,0x00,0x02,0x7d,0x00 + +# CHECK: scratch_store_dword off, v2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x03,0x00] +0x00,0x40,0x70,0xdc,0x00,0x02,0x03,0x00 + +# CHECK: scratch_store_dword off, v2, s3 offset:2047 ; encoding: [0xff,0x47,0x70,0xdc,0x00,0x02,0x03,0x00] +0xff,0x47,0x70,0xdc,0x00,0x02,0x03,0x00 + +# CHECK: scratch_store_dword off, v2, s3 offset:-2048 ; encoding: [0x00,0x48,0x70,0xdc,0x00,0x02,0x03,0x00] +0x00,0x48,0x70,0xdc,0x00,0x02,0x03,0x00 + +# CHECK: scratch_store_dword off, v2, s3 offset:-1 glc ; encoding: [0xff,0x4f,0x71,0xdc,0x00,0x02,0x03,0x00] +0xff,0x4f,0x71,0xdc,0x00,0x02,0x03,0x00 + +# CHECK: scratch_store_dword off, v2, s3 offset:-1 slc ; encoding: [0xff,0x4f,0x72,0xdc,0x00,0x02,0x03,0x00] +0xff,0x4f,0x72,0xdc,0x00,0x02,0x03,0x00 + +# CHECK: scratch_store_dword off, v2, s3 offset:-1 dlc ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00] +0xff,0x5f,0x70,0xdc,0x00,0x02,0x03,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/sop2_gfx10.txt b/llvm/test/MC/Disassembler/AMDGPU/sop2_gfx10.txt new file mode 100644 index 00000000000000..adeb47aa9d4d6d --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/sop2_gfx10.txt @@ -0,0 +1,7 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX10 + +# GFX10: s_ashr_i64 s[0:1], null, s0 ; encoding: [0x7d,0x00,0x80,0x91] +0x7d,0x00,0x80,0x91 + +# GFX10: s_and_b64 s[0:1], null, null ; encoding: [0x7d,0x7d,0x80,0x87] +0x7d,0x7d,0x80,0x87 diff --git a/llvm/test/MC/Disassembler/AMDGPU/vcmp-gfx10.txt b/llvm/test/MC/Disassembler/AMDGPU/vcmp-gfx10.txt new file mode 100644 index 00000000000000..9dfa8606da42e7 --- 
/dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/vcmp-gfx10.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10,W32 %s +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10,W64 %s + +# W32: v_cmp_class_f16_sdwa ttmp14, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0xfa,0x06,0x06] +# W64: v_cmp_class_f16_sdwa ttmp[14:15], v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0xfa,0x06,0x06] +0xf9,0x04,0x1e,0x7d,0x01,0xfa,0x06,0x06 diff --git a/llvm/test/MC/Mips/crc/module-crc.s b/llvm/test/MC/Mips/crc/module-crc.s index 92c428e67ff9f8..66c54647cf448d 100644 --- a/llvm/test/MC/Mips/crc/module-crc.s +++ b/llvm/test/MC/Mips/crc/module-crc.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r6 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module crc diff --git a/llvm/test/MC/Mips/crc/module-nocrc.s b/llvm/test/MC/Mips/crc/module-nocrc.s index c67279194c8e13..193ed360b57447 100644 --- a/llvm/test/MC/Mips/crc/module-nocrc.s +++ b/llvm/test/MC/Mips/crc/module-nocrc.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r6 -filetype=obj -o - -mattr=+crc | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module nocrc diff --git a/llvm/test/MC/Mips/ginv/module-ginv.s b/llvm/test/MC/Mips/ginv/module-ginv.s index 07f1bc4d40e9ee..8adcd90b23f79c 100644 --- a/llvm/test/MC/Mips/ginv/module-ginv.s +++ b/llvm/test/MC/Mips/ginv/module-ginv.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r6 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module ginv diff --git a/llvm/test/MC/Mips/ginv/module-noginv.s b/llvm/test/MC/Mips/ginv/module-noginv.s index 2ed4fd9c314b7b..611d72c52d5610 100644 --- a/llvm/test/MC/Mips/ginv/module-noginv.s +++ b/llvm/test/MC/Mips/ginv/module-noginv.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r6 -filetype=obj -o - -mattr=+ginv | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module noginv diff --git a/llvm/test/MC/Mips/micromips-ase-directive.s b/llvm/test/MC/Mips/micromips-ase-directive.s index f3ac60057dc5e7..fef40ecc3eeb55 100644 --- a/llvm/test/MC/Mips/micromips-ase-directive.s +++ b/llvm/test/MC/Mips/micromips-ase-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=mips-unknown-linux -filetype=obj %s -o - | \ -# RUN: llvm-readobj --mips-abi-flags | \ +# RUN: llvm-readobj -A | \ # RUN: FileCheck --check-prefix=ASE-MICROMIPS %s .set micromips diff --git a/llvm/test/MC/Mips/micromips-jump-pc-region.s b/llvm/test/MC/Mips/micromips-jump-pc-region.s new file mode 100644 index 00000000000000..5f598fc016ffe1 --- /dev/null +++ b/llvm/test/MC/Mips/micromips-jump-pc-region.s @@ -0,0 +1,17 @@ +# RUN: llvm-mc -triple=mips -mcpu=mips32 -mattr=+micromips -filetype=obj < %s \ +# RUN: | llvm-objdump -d - | FileCheck %s + +.set noreorder + +# Force us into the second 256 MB 
region with a non-zero instruction index +.org 256*1024*1024 + 12 +# CHECK-LABEL: 1000000c foo: +# CHECK-NEXT: 1000000c: d4 00 00 06 j 12 +# CHECK-NEXT: 10000010: f4 00 00 08 jal 16 +# CHECK-NEXT: 10000014: f0 00 00 05 jalx 20 +# CHECK-NEXT: 10000018: 74 00 00 0c jals 24 +foo: + j 12 + jal 16 + jalx 20 + jals 24 diff --git a/llvm/test/MC/Mips/micromips32r6/relocations.s b/llvm/test/MC/Mips/micromips32r6/relocations.s index 7e8f3f6107e6a5..615b445a0faad3 100644 --- a/llvm/test/MC/Mips/micromips32r6/relocations.s +++ b/llvm/test/MC/Mips/micromips32r6/relocations.s @@ -26,6 +26,12 @@ # CHECK-FIXUP: bnezc $3, bar # encoding: [0xa0,0b011AAAAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar-4, kind: fixup_MICROMIPS_PC21_S1 +# CHECK-FIXUP: jialc $5, bar # encoding: [0x80,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_MICROMIPS_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: [0xa0,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_MICROMIPS_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. #------------------------------------------------------------------------------ @@ -37,6 +43,8 @@ # CHECK-ELF: 0x10 R_MICROMIPS_PC19_S2 bar 0x0 # CHECK-ELF: 0x14 R_MICROMIPS_PC21_S1 bar 0x0 # CHECK-ELF: 0x18 R_MICROMIPS_PC21_S1 bar 0x0 +# CHECK-ELF: 0x1C R_MICROMIPS_LO16 bar 0x0 +# CHECK-ELF: 0x20 R_MICROMIPS_LO16 bar 0x0 # CHECK-ELF: ] balc bar @@ -46,3 +54,5 @@ lwpc $2,bar beqzc $3, bar bnezc $3, bar + jialc $5, bar + jic $5, bar diff --git a/llvm/test/MC/Mips/mips-jump-pc-region.s b/llvm/test/MC/Mips/mips-jump-pc-region.s new file mode 100644 index 00000000000000..2d6bbce3a492b2 --- /dev/null +++ b/llvm/test/MC/Mips/mips-jump-pc-region.s @@ -0,0 +1,17 @@ +# RUN: llvm-mc -triple=mips -mcpu=mips32 -filetype=obj < %s \ +# RUN: | llvm-objdump -d - | FileCheck %s +# RUN: llvm-mc -triple=mips64 -mcpu=mips64 -filetype=obj < %s \ +# RUN: | llvm-objdump -d - | FileCheck %s + +.set noreorder + +# Force us into the second 256 MB region with a non-zero instruction index +.org 256*1024*1024 + 12 +# CHECK-LABEL: 1000000c foo: +# CHECK-NEXT: 1000000c: 08 00 00 03 j 12 +# CHECK-NEXT: 10000010: 0c 00 00 04 jal 16 +# CHECK-NEXT: 10000014: 74 00 00 05 jalx 20 +foo: + j 12 + jal 16 + jalx 20 diff --git a/llvm/test/MC/Mips/mips32r6/relocations.s b/llvm/test/MC/Mips/mips32r6/relocations.s index 3f42ee8f471749..8095fb156ec93f 100644 --- a/llvm/test/MC/Mips/mips32r6/relocations.s +++ b/llvm/test/MC/Mips/mips32r6/relocations.s @@ -40,6 +40,12 @@ # CHECK-FIXUP: lwpc $2, bar # encoding: [0xec,0b01001AAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2 +# CHECK-FIXUP: jialc $5, bar # encoding: [0xf8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: [0xd8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. 
#------------------------------------------------------------------------------ @@ -55,6 +61,8 @@ # CHECK-ELF: 0x20 R_MIPS_PCLO16 bar 0x0 # CHECK-ELF: 0x24 R_MIPS_PC19_S2 bar 0x0 # CHECK-ELF: 0x28 R_MIPS_PC19_S2 bar 0x0 +# CHECK-ELF: 0x2C R_MIPS_LO16 bar 0x0 +# CHECK-ELF: 0x30 R_MIPS_LO16 bar 0x0 # CHECK-ELF: ] addiupc $2,bar @@ -68,3 +76,5 @@ addiu $2, $2, %pcrel_lo(bar) lapc $2,bar lwpc $2,bar + jialc $5, bar + jic $5, bar diff --git a/llvm/test/MC/Mips/mips64r6/relocations.s b/llvm/test/MC/Mips/mips64r6/relocations.s index 4f4efda07c6953..5e70f44b96e1aa 100644 --- a/llvm/test/MC/Mips/mips64r6/relocations.s +++ b/llvm/test/MC/Mips/mips64r6/relocations.s @@ -47,6 +47,12 @@ # CHECK-FIXUP: lwupc $2, bar # encoding: [0xec,0b01010AAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2 +# CHECK-FIXUP: jialc $5, bar # encoding: [0xf8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: [0xd8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. #------------------------------------------------------------------------------ @@ -64,6 +70,8 @@ # CHECK-ELF: 0x28 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: 0x2C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: 0x30 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x34 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x38 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: ] addiupc $2,bar @@ -79,3 +87,5 @@ ldpc $2,bar lwpc $2,bar lwupc $2,bar + jialc $5, bar + jic $5, bar diff --git a/llvm/test/MC/Mips/mips_abi_flags_xx.s b/llvm/test/MC/Mips/mips_abi_flags_xx.s index 94101ae0c8f5a0..f8386b49774ff7 100644 --- a/llvm/test/MC/Mips/mips_abi_flags_xx.s +++ b/llvm/test/MC/Mips/mips_abi_flags_xx.s @@ -2,19 +2,19 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R1,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mattr=fpxx -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R1,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mcpu=mips32r6 -mattr=fpxx -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R6,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips64-unknown-linux-gnu -mcpu=octeon -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-64R2,CHECK-OBJ-OCTEON # CHECK-ASM: .module fp=xx diff --git a/llvm/test/MC/Mips/mips_abi_flags_xx_set.s 
b/llvm/test/MC/Mips/mips_abi_flags_xx_set.s index f2445eba7774d2..8e4e2dbcf5343a 100644 --- a/llvm/test/MC/Mips/mips_abi_flags_xx_set.s +++ b/llvm/test/MC/Mips/mips_abi_flags_xx_set.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module fp=xx diff --git a/llvm/test/MC/Mips/module-hardfloat.s b/llvm/test/MC/Mips/module-hardfloat.s index f29fbc09353c22..5738a09a91b953 100644 --- a/llvm/test/MC/Mips/module-hardfloat.s +++ b/llvm/test/MC/Mips/module-hardfloat.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module hardfloat diff --git a/llvm/test/MC/Mips/module-softfloat.s b/llvm/test/MC/Mips/module-softfloat.s index 77e62e38e2014b..94ab7be63dccb1 100644 --- a/llvm/test/MC/Mips/module-softfloat.s +++ b/llvm/test/MC/Mips/module-softfloat.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module softfloat diff --git a/llvm/test/MC/Mips/mt/abiflag.s b/llvm/test/MC/Mips/mt/abiflag.s index 2d03c5d1106cfc..d067c55587c913 100644 --- a/llvm/test/MC/Mips/mt/abiflag.s +++ b/llvm/test/MC/Mips/mt/abiflag.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -mattr=+mt -filetype=obj -o - \ -# RUN: | llvm-readobj --mips-abi-flags | FileCheck %s +# RUN: | llvm-readobj -A | FileCheck %s # Test that the usage of the MT ASE is recorded in .MIPS.abiflags diff --git a/llvm/test/MC/Mips/mt/module-directive.s b/llvm/test/MC/Mips/mt/module-directive.s index 0d9ab97b4550da..1bbe91147545ea 100644 --- a/llvm/test/MC/Mips/mt/module-directive.s +++ b/llvm/test/MC/Mips/mt/module-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags | FileCheck --check-prefix=CHECK-OBJ %s +# RUN: llvm-readobj -A | FileCheck --check-prefix=CHECK-OBJ %s # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \ # RUN: FileCheck --check-prefix=CHECK-ASM %s diff --git a/llvm/test/MC/Mips/mt/set-directive.s b/llvm/test/MC/Mips/mt/set-directive.s index 9088655d8c5d71..5d18486059d496 100644 --- a/llvm/test/MC/Mips/mt/set-directive.s +++ b/llvm/test/MC/Mips/mt/set-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags | FileCheck %s --check-prefix=CHECK-OBJ +# RUN: llvm-readobj -A | FileCheck %s --check-prefix=CHECK-OBJ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \ # RUN: FileCheck %s --check-prefix=CHECK-ASM diff --git a/llvm/test/MC/Mips/virt/module-novirt.s b/llvm/test/MC/Mips/virt/module-novirt.s index 0f531dbbc80be2..6b953d0c58576e 100644 --- a/llvm/test/MC/Mips/virt/module-novirt.s +++ b/llvm/test/MC/Mips/virt/module-novirt.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r5 -filetype=obj -o - -mattr=+virt | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# 
RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module novirt diff --git a/llvm/test/MC/Mips/virt/module-virt.s b/llvm/test/MC/Mips/virt/module-virt.s index ae38b83d8486a8..1fb035df8783f6 100644 --- a/llvm/test/MC/Mips/virt/module-virt.s +++ b/llvm/test/MC/Mips/virt/module-virt.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r5 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module virt diff --git a/llvm/test/MC/RISCV/rvf-aliases-valid.s b/llvm/test/MC/RISCV/rvf-aliases-valid.s index 725dbe6d6a2c38..0d8179ff31f92c 100644 --- a/llvm/test/MC/RISCV/rvf-aliases-valid.s +++ b/llvm/test/MC/RISCV/rvf-aliases-valid.s @@ -55,6 +55,18 @@ fscsr x6, x7 # CHECK-ALIAS: fscsr t3 fscsr x28 +# These are obsolete aliases of frcsr/fscsr. They are accepted by the assembler +# but the disassembler should always print them as the equivalent, new aliases. +# CHECK-INST: csrrs t4, fcsr, zero +# CHECK-ALIAS: frcsr t4 +frsr x29 +# CHECK-INST: csrrw t5, fcsr, t6 +# CHECK-ALIAS: fscsr t5, t6 +fssr x30, x31 +# CHECK-INST: csrrw zero, fcsr, s0 +# CHECK-ALIAS: fscsr s0 +fssr x8 + # CHECK-INST: csrrs t4, frm, zero # CHECK-ALIAS: frrm t4 frrm x29 diff --git a/llvm/test/Object/Mips/abi-flags.yaml b/llvm/test/Object/Mips/abi-flags.yaml index b5142fd3303acc..ce8234a9a0dbef 100644 --- a/llvm/test/Object/Mips/abi-flags.yaml +++ b/llvm/test/Object/Mips/abi-flags.yaml @@ -1,5 +1,5 @@ # RUN: yaml2obj %s > %t -# RUN: llvm-readobj --mips-abi-flags %t | FileCheck -check-prefix=OBJ %s +# RUN: llvm-readobj -A %t | FileCheck -check-prefix=OBJ %s # RUN: obj2yaml %t | FileCheck -check-prefix=YAML %s # OBJ: MIPS ABI Flags { diff --git a/llvm/test/ThinLTO/X86/Inputs/devirt_promote.ll b/llvm/test/ThinLTO/X86/Inputs/devirt_promote.ll new file mode 100644 index 00000000000000..bcd49aefe15850 --- /dev/null +++ b/llvm/test/ThinLTO/X86/Inputs/devirt_promote.ll @@ -0,0 +1,39 @@ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +%struct.A = type { i32 (...)** } +%struct.B = type { %struct.A } + +@_ZTV1B = constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* undef, i8* bitcast (i32 (%struct.B*, i32)* @_ZN1B1fEi to i8*), i8* bitcast (i32 (%struct.A*, i32)* @_ZN1A1nEi to i8*)] }, !type !0, !type !1 + +define i32 @_ZN1B1fEi(%struct.B* %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1A1nEi(%struct.A* %this, i32 %a) #0 { + ret i32 0; +} + +define i32 @test2(%struct.B* %obj, i32 %a) { +entry: + %0 = bitcast %struct.B* %obj to i8*** + %vtable2 = load i8**, i8*** %0 + %1 = bitcast i8** %vtable2 to i8* + %p2 = call i1 @llvm.type.test(i8* %1, metadata !"_ZTS1B") + call void @llvm.assume(i1 %p2) + + %fptrptr = getelementptr i8*, i8** %vtable2, i32 1 + %2 = bitcast i8** %fptrptr to i32 (%struct.B*, i32)** + %fptr33 = load i32 (%struct.B*, i32)*, i32 (%struct.B*, i32)** %2, align 8 + + %call4 = tail call i32 %fptr33(%struct.B* nonnull %obj, i32 %a) + ret i32 %call4 +} + +attributes #0 = { noinline optnone } + +declare i1 @llvm.type.test(i8*, metadata) +declare void @llvm.assume(i1) + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTS1B"} diff --git a/llvm/test/ThinLTO/X86/devirt_promote.ll b/llvm/test/ThinLTO/X86/devirt_promote.ll new file mode 100644 index 00000000000000..563ed994157a29 --- /dev/null +++ b/llvm/test/ThinLTO/X86/devirt_promote.ll @@ -0,0 +1,72 @@ +; REQUIRES: 
x86-registered-target + +; Test devirtualization requiring promotion of local targets, where the +; promotion is required by one devirtualization and needs to be updated +; for a second devirtualization in the defining module as a post-pass +; update. + +; Generate unsplit module with summary for ThinLTO index-based WPD. +; RUN: opt -thinlto-bc -o %t3.o %s +; RUN: opt -thinlto-bc -o %t4.o %p/Inputs/devirt_promote.ll + +; RUN: llvm-lto2 run %t3.o %t4.o -save-temps -use-new-pm -pass-remarks=. \ +; RUN: -wholeprogramdevirt-print-index-based \ +; RUN: -o %t5 \ +; RUN: -r=%t3.o,test,px \ +; RUN: -r=%t4.o,_ZN1B1fEi,p \ +; RUN: -r=%t4.o,test2,px \ +; RUN: -r=%t4.o,_ZTV1B,px \ +; RUN: 2>&1 | FileCheck %s --check-prefix=REMARK --check-prefix=PRINT +; RUN: llvm-dis %t5.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR1 +; RUN: llvm-dis %t5.2.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR2 +; RUN: llvm-nm %t5.1 | FileCheck %s --check-prefix=NM-INDEX1 +; RUN: llvm-nm %t5.2 | FileCheck %s --check-prefix=NM-INDEX2 + +; NM-INDEX1: U _ZN1A1nEi.llvm. + +; Make sure that not only did _ZN1A1nEi get promoted (due to the +; devirtualization in the other module) but the reference due to the +; devirtualization in its defining module should be to the promoted +; symbol. +; NM-INDEX2-NOT: U _ZN1A1nEi +; NM-INDEX2: T _ZN1A1nEi.llvm. +; NM-INDEX2-NOT: U _ZN1A1nEi + +; We should devirt call to _ZN1A1nEi once in importing module and once +; in original (exporting) module. +; REMARK-COUNT-2: single-impl: devirtualized a call to _ZN1A1nEi.llvm. + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +%struct.A = type { i32 (...)** } + +; CHECK-IR1-LABEL: define i32 @test +define i32 @test(%struct.A* %obj, i32 %a) { +entry: + %0 = bitcast %struct.A* %obj to i8*** + %vtable = load i8**, i8*** %0 + %1 = bitcast i8** %vtable to i8* + %p = call i1 @llvm.type.test(i8* %1, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr i8*, i8** %vtable, i32 1 + %2 = bitcast i8** %fptrptr to i32 (%struct.A*, i32)** + %fptr1 = load i32 (%struct.A*, i32)*, i32 (%struct.A*, i32)** %2, align 8 + + ; Check that the call was devirtualized. + ; CHECK-IR1: %call = tail call i32 bitcast (void ()* @_ZN1A1nEi + %call = tail call i32 %fptr1(%struct.A* nonnull %obj, i32 %a) + + ret i32 %call +} +; CHECK-IR1-LABEL: ret i32 +; CHECK-IR1-LABEL: } + +; CHECK-IR2: define i32 @test2 +; Check that the call was devirtualized. 
+; CHECK-IR2: %call4 = tail call i32 @_ZN1A1nEi + +declare i1 @llvm.type.test(i8*, metadata) +declare void @llvm.assume(i1) + +attributes #0 = { noinline optnone } diff --git a/llvm/test/Transforms/DCE/calls-errno.ll b/llvm/test/Transforms/DCE/calls-errno.ll index 20ee0d06d3a9ba..376b8d1e5882f0 100644 --- a/llvm/test/Transforms/DCE/calls-errno.ll +++ b/llvm/test/Transforms/DCE/calls-errno.ll @@ -76,10 +76,6 @@ entry: ; CHECK-NEXT: %cos3 = call double @cos(double 0.000000e+00) %cos3 = call double @cos(double 0.000000e+00) nobuiltin -; cos(1) strictfp sets FP status flags -; CHECK-NEXT: %cos4 = call double @cos(double 1.000000e+00) - %cos4 = call double @cos(double 1.000000e+00) strictfp - ; pow(0, 1) is 0 %pow1 = call double @pow(double 0x7FF0000000000000, double 1.000000e+00) @@ -97,3 +93,16 @@ entry: ; CHECK-NEXT: ret void ret void } + +define void @Tstrict() strictfp { +entry: +; CHECK-LABEL: @Tstrict( +; CHECK-NEXT: entry: + +; cos(1) strictfp sets FP status flags +; CHECK-NEXT: %cos4 = call double @cos(double 1.000000e+00) + %cos4 = call double @cos(double 1.000000e+00) strictfp + +; CHECK-NEXT: ret void + ret void +} diff --git a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll index 811b50783a5c5b..fbf2061ff650ad 100644 --- a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll +++ b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll @@ -1,3 +1,5 @@ +; REQUIRES: asserts +; RUN: opt -S -instsimplify -hotcoldsplit -debug < %s 2>&1 | FileCheck %s ; RUN: opt -instcombine -hotcoldsplit -instsimplify %s -o /dev/null target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -6,6 +8,16 @@ target triple = "aarch64" %a = type { i64, i64 } %b = type { i64 } +; CHECK: @f +; CHECK-LABEL: codeRepl: +; CHECK-NOT: @llvm.assume +; CHECK: } +; CHECK: declare {{.*}}@llvm.assume +; CHECK: define {{.*}}@f.cold.1(i64 %0) +; CHECK-LABEL: newFuncRoot: +; CHECK: %1 = icmp eq i64 %0, 0 +; CHECK: call void @llvm.assume(i1 %1) + define void @f() { entry: %0 = getelementptr inbounds %a, %a* null, i64 0, i32 1 diff --git a/llvm/test/Transforms/IndVarSimplify/loop_evaluate_1.ll b/llvm/test/Transforms/IndVarSimplify/loop_evaluate_1.ll index 6edacc17518b96..be305673357fc7 100644 --- a/llvm/test/Transforms/IndVarSimplify/loop_evaluate_1.ll +++ b/llvm/test/Transforms/IndVarSimplify/loop_evaluate_1.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -indvars -loop-deletion -simplifycfg -S | FileCheck %s @@ -28,10 +30,14 @@ define i32 @test2(i32 %arg) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP:%.*]] = icmp ugt i32 [[ARG:%.*]], 10 +; CHECK-NEXT: br i1 [[TMP]], label [[BB1_PREHEADER:%.*]], label [[BB7:%.*]] +; CHECK: bb1.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[ARG]], -11 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP]], i32 [[TMP2]], i32 0 +; CHECK-NEXT: br label [[BB7]] +; CHECK: bb7: +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP2]], [[BB1_PREHEADER]] ] ; CHECK-NEXT: ret i32 [[TMP8]] ; bb: diff --git a/llvm/test/Transforms/InstCombine/addrspacecast.ll b/llvm/test/Transforms/InstCombine/addrspacecast.ll index 6caefb166dbf50..2e34f61a662354 100644 --- a/llvm/test/Transforms/InstCombine/addrspacecast.ll +++ b/llvm/test/Transforms/InstCombine/addrspacecast.ll @@ -104,7 +104,7 @@ define <4 x float addrspace(2)*> @combine_addrspacecast_types_vector(<4 x i32 ad define i32 @canonicalize_addrspacecast([16 x i32] addrspace(1)* %arr) { ; CHECK-LABEL: @canonicalize_addrspacecast( -; CHECK-NEXT: getelementptr inbounds [16 x i32], [16 x i32] addrspace(1)* %arr, i32 0, i32 0 +; CHECK-NEXT: getelementptr [16 x i32], [16 x i32] addrspace(1)* %arr, i32 0, i32 0 ; CHECK-NEXT: addrspacecast i32 addrspace(1)* %{{[a-zA-Z0-9]+}} to i32* ; CHECK-NEXT: load i32, i32* ; CHECK-NEXT: ret i32 diff --git a/llvm/test/Transforms/InstCombine/bcopy.ll b/llvm/test/Transforms/InstCombine/bcopy.ll new file mode 100644 index 00000000000000..6a53bad7eeb097 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/bcopy.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare void @bcopy(i8* nocapture readonly, i8* nocapture, i32) + +define void @bcopy_memmove(i8* nocapture readonly %a, i8* nocapture %b) { +; CHECK-LABEL: @bcopy_memmove( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[A:%.*]] to i64* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B:%.*]] to i64* +; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1 +; CHECK-NEXT: store i64 [[TMP3]], i64* [[TMP2]], align 1 +; CHECK-NEXT: ret void +; + tail call void @bcopy(i8* %a, i8* %b, i32 8) + ret void +} + +define void @bcopy_memmove2(i8* nocapture readonly %a, i8* nocapture %b, i32 %len) { +; CHECK-LABEL: @bcopy_memmove2( +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i32(i8* align 1 [[B:%.*]], i8* align 1 [[A:%.*]], i32 [[LEN:%.*]], i1 false) +; CHECK-NEXT: ret void +; + tail call void @bcopy(i8* %a, i8* %b, i32 %len) + ret void +} diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll index b6d1eda0601dd6..fd35bd92dd7dc8 100644 --- a/llvm/test/Transforms/InstCombine/cast.ll +++ b/llvm/test/Transforms/InstCombine/cast.ll @@ -293,7 +293,7 @@ define i32 @test26(float %F) { define [4 x float]* @test27([9 x [4 x float]]* %A) { ; CHECK-LABEL: @test27( -; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [9 x [4 x float]], [9 x [4 x float]]* [[A:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[C:%.*]] = getelementptr [9 x [4 x float]], [9 x [4 x float]]* [[A:%.*]], i64 0, i64 0 ; CHECK-NEXT: ret [4 x float]* [[C]] ; %c = bitcast [9 x [4 x float]]* %A to [4 x float]* @@ -302,7 +302,7 @@ define [4 x float]* @test27([9 x [4 x float]]* %A) { define float* @test28([4 x 
float]* %A) { ; CHECK-LABEL: @test28( -; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[A:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[C:%.*]] = getelementptr [4 x float], [4 x float]* [[A:%.*]], i64 0, i64 0 ; CHECK-NEXT: ret float* [[C]] ; %c = bitcast [4 x float]* %A to float* diff --git a/llvm/test/Transforms/InstCombine/constant-fold-libfunc.ll b/llvm/test/Transforms/InstCombine/constant-fold-libfunc.ll index 5d1aa821ea1ba5..af33d98954479e 100644 --- a/llvm/test/Transforms/InstCombine/constant-fold-libfunc.ll +++ b/llvm/test/Transforms/InstCombine/constant-fold-libfunc.ll @@ -23,7 +23,7 @@ define double @test_acos_nobuiltin() { ; Check that we don't constant fold strictfp results that require rounding. -define double @test_acos_strictfp() { +define double @test_acos_strictfp() strictfp { ; CHECK-LABEL: @test_acos_strictfp %pi = call double @acos(double -1.000000e+00) strictfp ; CHECK: call double @acos(double -1.000000e+00) diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll index 33dc8100b58147..f123f5d56b8079 100644 --- a/llvm/test/Transforms/InstCombine/fma.ll +++ b/llvm/test/Transforms/InstCombine/fma.ll @@ -182,7 +182,7 @@ define float @fmuladd_unary_fneg_x_unary_fneg_y(float %x, float %y, float %z) { define float @fmuladd_fneg_x_fneg_y_fast(float %x, float %y, float %z) { ; CHECK-LABEL: @fmuladd_fneg_x_fneg_y_fast( -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[FMULADD:%.*]] = fadd fast float [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret float [[FMULADD]] ; @@ -194,7 +194,7 @@ define float @fmuladd_fneg_x_fneg_y_fast(float %x, float %y, float %z) { define float @fmuladd_unary_fneg_x_unary_fneg_y_fast(float %x, float %y, float %z) { ; CHECK-LABEL: @fmuladd_unary_fneg_x_unary_fneg_y_fast( -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast float [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[FMULADD:%.*]] = fadd fast float [[TMP1]], [[Z:%.*]] ; CHECK-NEXT: ret float [[FMULADD]] ; @@ -499,6 +499,129 @@ entry: ret <2 x double> %res } +define <2 x double> @fma_nan_and_const_0(<2 x double> %b) { +; CHECK-LABEL: @fma_nan_and_const_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> , <2 x double> , <2 x double> %b) + ret <2 x double> %res +} + +define <2 x double> @fma_nan_and_const_1(<2 x double> %b) { +; CHECK-LABEL: @fma_nan_and_const_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> , <2 x double> , <2 x double> %b) + ret <2 x double> %res +} + +define <2 x double> @fma_nan_and_const_2(<2 x double> %b) { +; CHECK-LABEL: @fma_nan_and_const_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> , <2 x double> %b, <2 x double> ) + ret <2 x double> %res +} + +define <2 x double> @fma_undef_0(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_undef_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> , <2 x double> %b, <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fma_undef_1(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_undef_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> 
@llvm.fma.v2f64(<2 x double> %b, <2 x double> , <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fma_undef_2(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_undef_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> ) + ret <2 x double> %res +} + +define <2 x double> @fma_partial_undef_0(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_partial_undef_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RES:%.*]] = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> , <2 x double> [[C:%.*]]) +; CHECK-NEXT: ret <2 x double> [[RES]] +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> , <2 x double> %b, <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fma_partial_undef_1(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_partial_undef_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RES:%.*]] = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> , <2 x double> [[C:%.*]]) +; CHECK-NEXT: ret <2 x double> [[RES]] +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> , <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fma_partial_undef_2(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_partial_undef_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RES:%.*]] = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> ) +; CHECK-NEXT: ret <2 x double> [[RES]] +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> ) + ret <2 x double> %res +} + + +define <2 x double> @fma_nan_0(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_nan_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> , <2 x double> %b, <2 x double> %c) + ret <2 x double> %res +} +define <2 x double> @fma_nan_1(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_nan_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> , <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fma_nan_2(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fma_nan_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> ) + ret <2 x double> %res +} + define <2 x double> @fmuladd_const_fmul(<2 x double> %b) { ; CHECK-LABEL: @fmuladd_const_fmul( ; CHECK-NEXT: entry: @@ -510,6 +633,85 @@ entry: ret <2 x double> %res } +define <2 x double> @fmuladd_nan_and_const_0(<2 x double> %b) { +; CHECK-LABEL: @fmuladd_nan_and_const_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fmuladd.v2f64(<2 x double> , <2 x double> , <2 x double> %b) + ret <2 x double> %res +} + +define <2 x double> @fmuladd_nan_and_const_1(<2 x double> %b) { +; CHECK-LABEL: @fmuladd_nan_and_const_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fmuladd.v2f64(<2 x double> , <2 x double> , <2 x double> %b) + ret <2 x double> %res +} + +define <2 x double> @fmuladd_nan_and_const_2(<2 x double> %b) { +; CHECK-LABEL: @fmuladd_nan_and_const_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x 
double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fmuladd.v2f64(<2 x double> , <2 x double> %b, <2 x double> ) + ret <2 x double> %res +} + +define <2 x double> @fmuladd_nan_0(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fmuladd_nan_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fmuladd.v2f64(<2 x double> , <2 x double> %b, <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fmuladd_nan_1(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fmuladd_nan_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fmuladd.v2f64(<2 x double> %b, <2 x double> , <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fmuladd_undef_0(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fmuladd_undef_0( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fmuladd.v2f64(<2 x double> , <2 x double> %b, <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fmuladd_undef_1(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fmuladd_undef_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fmuladd.v2f64(<2 x double> %b, <2 x double> , <2 x double> %c) + ret <2 x double> %res +} + +define <2 x double> @fmuladd_undef_2(<2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: @fmuladd_undef_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <2 x double> +; +entry: + %res = call nnan nsz <2 x double> @llvm.fmuladd.v2f64(<2 x double> %b, <2 x double> %c, <2 x double> ) + ret <2 x double> %res +} declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index d448679804fe96..89c957b9d083b4 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -991,3 +991,81 @@ define double @fmul_negated_constant_expression(double %x) { %r = fmul double %x, fsub (double -0.000000e+00, double bitcast (i64 ptrtoint (i8** getelementptr inbounds ({ [2 x i8*] }, { [2 x i8*] }* @g, i64 0, inrange i32 0, i64 2) to i64) to double)) ret double %r } + +define float @negate_if_true(float %x, i1 %cond) { +; CHECK-LABEL: @negate_if_true( +; CHECK-NEXT: [[TMP1:%.*]] = fsub float -0.000000e+00, [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[COND:%.*]], float [[TMP1]], float [[X]] +; CHECK-NEXT: ret float [[TMP2]] +; + %sel = select i1 %cond, float -1.0, float 1.0 + %r = fmul float %sel, %x + ret float %r +} + +define float @negate_if_false(float %x, i1 %cond) { +; CHECK-LABEL: @negate_if_false( +; CHECK-NEXT: [[TMP1:%.*]] = fsub arcp float -0.000000e+00, [[X:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = select arcp i1 [[COND:%.*]], float [[X]], float [[TMP1]] +; CHECK-NEXT: ret float [[TMP2]] +; + %sel = select i1 %cond, float 1.0, float -1.0 + %r = fmul arcp float %sel, %x + ret float %r +} + +define <2 x double> @negate_if_true_commute(<2 x double> %px, i1 %cond) { +; CHECK-LABEL: @negate_if_true_commute( +; CHECK-NEXT: [[X:%.*]] = fdiv <2 x double> , [[PX:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fsub ninf <2 x double> , [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = select ninf i1 [[COND:%.*]], <2 x double> [[TMP1]], <2 x double> [[X]] +; CHECK-NEXT: ret <2 x double> [[TMP2]] +; + %x = fdiv <2 x double> , %px ; thwart complexity-based canonicalization + %sel = select i1 
%cond, <2 x double> , <2 x double> + %r = fmul ninf <2 x double> %x, %sel + ret <2 x double> %r +} + +define <2 x double> @negate_if_false_commute(<2 x double> %px, <2 x i1> %cond) { +; CHECK-LABEL: @negate_if_false_commute( +; CHECK-NEXT: [[X:%.*]] = fdiv <2 x double> , [[PX:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> , [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[COND:%.*]], <2 x double> [[X]], <2 x double> [[TMP1]] +; CHECK-NEXT: ret <2 x double> [[TMP2]] +; + %x = fdiv <2 x double> , %px ; thwart complexity-based canonicalization + %sel = select <2 x i1> %cond, <2 x double> , <2 x double> + %r = fmul <2 x double> %x, %sel + ret <2 x double> %r +} + +; Negative test + +define float @negate_if_true_extra_use(float %x, i1 %cond) { +; CHECK-LABEL: @negate_if_true_extra_use( +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], float -1.000000e+00, float 1.000000e+00 +; CHECK-NEXT: call void @use_f32(float [[SEL]]) +; CHECK-NEXT: [[R:%.*]] = fmul float [[SEL]], [[X:%.*]] +; CHECK-NEXT: ret float [[R]] +; + %sel = select i1 %cond, float -1.0, float 1.0 + call void @use_f32(float %sel) + %r = fmul float %sel, %x + ret float %r +} + +; Negative test + +define <2 x double> @negate_if_true_wrong_constant(<2 x double> %px, i1 %cond) { +; CHECK-LABEL: @negate_if_true_wrong_constant( +; CHECK-NEXT: [[X:%.*]] = fdiv <2 x double> , [[PX:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], <2 x double> , <2 x double> +; CHECK-NEXT: [[R:%.*]] = fmul <2 x double> [[X]], [[SEL]] +; CHECK-NEXT: ret <2 x double> [[R]] +; + %x = fdiv <2 x double> , %px ; thwart complexity-based canonicalization + %sel = select i1 %cond, <2 x double> , <2 x double> + %r = fmul <2 x double> %x, %sel + ret <2 x double> %r +} diff --git a/llvm/test/Transforms/InstCombine/icmp-shr-lt-gt.ll b/llvm/test/Transforms/InstCombine/icmp-shr-lt-gt.ll index bf1a031a412493..c9a803eb8ea5e6 100644 --- a/llvm/test/Transforms/InstCombine/icmp-shr-lt-gt.ll +++ b/llvm/test/Transforms/InstCombine/icmp-shr-lt-gt.ll @@ -1,8 +1,9 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s define i1 @lshrugt_01_00(i4 %x) { ; CHECK-LABEL: @lshrugt_01_00( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, 1 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], 1 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -12,7 +13,7 @@ define i1 @lshrugt_01_00(i4 %x) { define i1 @lshrugt_01_01(i4 %x) { ; CHECK-LABEL: @lshrugt_01_01( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, 3 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], 3 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -22,7 +23,7 @@ define i1 @lshrugt_01_01(i4 %x) { define i1 @lshrugt_01_02(i4 %x) { ; CHECK-LABEL: @lshrugt_01_02( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, 5 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], 5 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -32,7 +33,7 @@ define i1 @lshrugt_01_02(i4 %x) { define i1 @lshrugt_01_03(i4 %x) { ; CHECK-LABEL: @lshrugt_01_03( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -42,7 +43,7 @@ define i1 @lshrugt_01_03(i4 %x) { define i1 @lshrugt_01_04(i4 %x) { ; CHECK-LABEL: @lshrugt_01_04( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, -7 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], -7 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -52,7 +53,7 @@ define i1 @lshrugt_01_04(i4 %x) { define i1 @lshrugt_01_05(i4 %x) { ; CHECK-LABEL: @lshrugt_01_05( -; CHECK-NEXT: 
[[C:%.*]] = icmp ugt i4 %x, -5 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], -5 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -62,7 +63,7 @@ define i1 @lshrugt_01_05(i4 %x) { define i1 @lshrugt_01_06(i4 %x) { ; CHECK-LABEL: @lshrugt_01_06( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, -3 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], -3 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -153,7 +154,7 @@ define i1 @lshrugt_01_15(i4 %x) { define i1 @lshrugt_02_00(i4 %x) { ; CHECK-LABEL: @lshrugt_02_00( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, 3 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], 3 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 2 @@ -163,7 +164,7 @@ define i1 @lshrugt_02_00(i4 %x) { define i1 @lshrugt_02_01(i4 %x) { ; CHECK-LABEL: @lshrugt_02_01( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 2 @@ -173,7 +174,7 @@ define i1 @lshrugt_02_01(i4 %x) { define i1 @lshrugt_02_02(i4 %x) { ; CHECK-LABEL: @lshrugt_02_02( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, -5 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], -5 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 2 @@ -300,7 +301,7 @@ define i1 @lshrugt_02_15(i4 %x) { define i1 @lshrugt_03_00(i4 %x) { ; CHECK-LABEL: @lshrugt_03_00( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 3 @@ -454,7 +455,7 @@ define i1 @lshrult_01_00(i4 %x) { define i1 @lshrult_01_01(i4 %x) { ; CHECK-LABEL: @lshrult_01_01( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, 2 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], 2 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -464,7 +465,7 @@ define i1 @lshrult_01_01(i4 %x) { define i1 @lshrult_01_02(i4 %x) { ; CHECK-LABEL: @lshrult_01_02( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -474,7 +475,7 @@ define i1 @lshrult_01_02(i4 %x) { define i1 @lshrult_01_03(i4 %x) { ; CHECK-LABEL: @lshrult_01_03( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, 6 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], 6 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -484,7 +485,7 @@ define i1 @lshrult_01_03(i4 %x) { define i1 @lshrult_01_04(i4 %x) { ; CHECK-LABEL: @lshrult_01_04( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -494,7 +495,7 @@ define i1 @lshrult_01_04(i4 %x) { define i1 @lshrult_01_05(i4 %x) { ; CHECK-LABEL: @lshrult_01_05( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, -6 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], -6 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -504,7 +505,7 @@ define i1 @lshrult_01_05(i4 %x) { define i1 @lshrult_01_06(i4 %x) { ; CHECK-LABEL: @lshrult_01_06( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -514,7 +515,7 @@ define i1 @lshrult_01_06(i4 %x) { define i1 @lshrult_01_07(i4 %x) { ; CHECK-LABEL: @lshrult_01_07( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, -2 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], -2 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 1 @@ -605,7 +606,7 @@ define i1 @lshrult_02_00(i4 %x) { define i1 @lshrult_02_01(i4 %x) { ; CHECK-LABEL: @lshrult_02_01( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; 
%s = lshr i4 %x, 2 @@ -615,7 +616,7 @@ define i1 @lshrult_02_01(i4 %x) { define i1 @lshrult_02_02(i4 %x) { ; CHECK-LABEL: @lshrult_02_02( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 2 @@ -625,7 +626,7 @@ define i1 @lshrult_02_02(i4 %x) { define i1 @lshrult_02_03(i4 %x) { ; CHECK-LABEL: @lshrult_02_03( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 2 @@ -752,7 +753,7 @@ define i1 @lshrult_03_00(i4 %x) { define i1 @lshrult_03_01(i4 %x) { ; CHECK-LABEL: @lshrult_03_01( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr i4 %x, 3 @@ -888,7 +889,7 @@ define i1 @lshrult_03_15(i4 %x) { define i1 @ashrsgt_01_00(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_00( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, 1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], 1 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -898,7 +899,7 @@ define i1 @ashrsgt_01_00(i4 %x) { define i1 @ashrsgt_01_01(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_01( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, 3 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], 3 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -908,7 +909,7 @@ define i1 @ashrsgt_01_01(i4 %x) { define i1 @ashrsgt_01_02(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_02( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, 5 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], 5 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -999,7 +1000,7 @@ define i1 @ashrsgt_01_11(i4 %x) { define i1 @ashrsgt_01_12(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_12( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -7 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -7 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1009,7 +1010,7 @@ define i1 @ashrsgt_01_12(i4 %x) { define i1 @ashrsgt_01_13(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_13( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -5 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -5 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1019,7 +1020,7 @@ define i1 @ashrsgt_01_13(i4 %x) { define i1 @ashrsgt_01_14(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_14( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -3 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -3 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1029,7 +1030,7 @@ define i1 @ashrsgt_01_14(i4 %x) { define i1 @ashrsgt_01_15(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_15( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1039,7 +1040,7 @@ define i1 @ashrsgt_01_15(i4 %x) { define i1 @ashrsgt_02_00(i4 %x) { ; CHECK-LABEL: @ashrsgt_02_00( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, 3 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], 3 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 2 @@ -1166,7 +1167,7 @@ define i1 @ashrsgt_02_13(i4 %x) { define i1 @ashrsgt_02_14(i4 %x) { ; CHECK-LABEL: @ashrsgt_02_14( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -5 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -5 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 2 @@ -1176,7 +1177,7 @@ define i1 @ashrsgt_02_14(i4 %x) { define i1 @ashrsgt_02_15(i4 %x) { ; CHECK-LABEL: @ashrsgt_02_15( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 2 @@ -1321,7 +1322,7 @@ define i1 @ashrsgt_03_14(i4 %x) { define i1 
@ashrsgt_03_15(i4 %x) { ; CHECK-LABEL: @ashrsgt_03_15( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 3 @@ -1331,7 +1332,7 @@ define i1 @ashrsgt_03_15(i4 %x) { define i1 @ashrslt_01_00(i4 %x) { ; CHECK-LABEL: @ashrslt_01_00( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1341,7 +1342,7 @@ define i1 @ashrslt_01_00(i4 %x) { define i1 @ashrslt_01_01(i4 %x) { ; CHECK-LABEL: @ashrslt_01_01( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 2 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 2 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1351,7 +1352,7 @@ define i1 @ashrslt_01_01(i4 %x) { define i1 @ashrslt_01_02(i4 %x) { ; CHECK-LABEL: @ashrslt_01_02( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1361,7 +1362,7 @@ define i1 @ashrslt_01_02(i4 %x) { define i1 @ashrslt_01_03(i4 %x) { ; CHECK-LABEL: @ashrslt_01_03( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 6 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 6 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1452,7 +1453,7 @@ define i1 @ashrslt_01_12(i4 %x) { define i1 @ashrslt_01_13(i4 %x) { ; CHECK-LABEL: @ashrslt_01_13( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, -6 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], -6 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1462,7 +1463,7 @@ define i1 @ashrslt_01_13(i4 %x) { define i1 @ashrslt_01_14(i4 %x) { ; CHECK-LABEL: @ashrslt_01_14( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1472,7 +1473,7 @@ define i1 @ashrslt_01_14(i4 %x) { define i1 @ashrslt_01_15(i4 %x) { ; CHECK-LABEL: @ashrslt_01_15( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, -2 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], -2 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 1 @@ -1482,7 +1483,7 @@ define i1 @ashrslt_01_15(i4 %x) { define i1 @ashrslt_02_00(i4 %x) { ; CHECK-LABEL: @ashrslt_02_00( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 2 @@ -1492,7 +1493,7 @@ define i1 @ashrslt_02_00(i4 %x) { define i1 @ashrslt_02_01(i4 %x) { ; CHECK-LABEL: @ashrslt_02_01( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 2 @@ -1619,7 +1620,7 @@ define i1 @ashrslt_02_14(i4 %x) { define i1 @ashrslt_02_15(i4 %x) { ; CHECK-LABEL: @ashrslt_02_15( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 2 @@ -1629,7 +1630,7 @@ define i1 @ashrslt_02_15(i4 %x) { define i1 @ashrslt_03_00(i4 %x) { ; CHECK-LABEL: @ashrslt_03_00( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr i4 %x, 3 @@ -1774,7 +1775,7 @@ define i1 @ashrslt_03_15(i4 %x) { define i1 @lshrugt_01_00_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_01_00_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ne i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp ne i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -1784,7 +1785,7 @@ define i1 @lshrugt_01_00_exact(i4 %x) { define i1 @lshrugt_01_01_exact(i4 %x) { ; CHECK-LABEL: 
@lshrugt_01_01_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, 2 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], 2 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -1794,7 +1795,7 @@ define i1 @lshrugt_01_01_exact(i4 %x) { define i1 @lshrugt_01_02_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_01_02_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -1804,7 +1805,7 @@ define i1 @lshrugt_01_02_exact(i4 %x) { define i1 @lshrugt_01_03_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_01_03_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, 6 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], 6 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -1814,7 +1815,7 @@ define i1 @lshrugt_01_03_exact(i4 %x) { define i1 @lshrugt_01_04_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_01_04_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, -8 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], -8 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -1824,7 +1825,7 @@ define i1 @lshrugt_01_04_exact(i4 %x) { define i1 @lshrugt_01_05_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_01_05_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, -6 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], -6 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -1834,7 +1835,7 @@ define i1 @lshrugt_01_05_exact(i4 %x) { define i1 @lshrugt_01_06_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_01_06_exact( -; CHECK-NEXT: [[C:%.*]] = icmp eq i4 %x, -2 +; CHECK-NEXT: [[C:%.*]] = icmp eq i4 [[X:%.*]], -2 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -1925,7 +1926,7 @@ define i1 @lshrugt_01_15_exact(i4 %x) { define i1 @lshrugt_02_00_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_02_00_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ne i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp ne i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 2 @@ -1935,7 +1936,7 @@ define i1 @lshrugt_02_00_exact(i4 %x) { define i1 @lshrugt_02_01_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_02_01_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 2 @@ -1945,7 +1946,7 @@ define i1 @lshrugt_02_01_exact(i4 %x) { define i1 @lshrugt_02_02_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_02_02_exact( -; CHECK-NEXT: [[C:%.*]] = icmp eq i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp eq i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 2 @@ -2072,7 +2073,7 @@ define i1 @lshrugt_02_15_exact(i4 %x) { define i1 @lshrugt_03_00_exact(i4 %x) { ; CHECK-LABEL: @lshrugt_03_00_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ne i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp ne i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 3 @@ -2226,7 +2227,7 @@ define i1 @lshrult_01_00_exact(i4 %x) { define i1 @lshrult_01_01_exact(i4 %x) { ; CHECK-LABEL: @lshrult_01_01_exact( -; CHECK-NEXT: [[C:%.*]] = icmp eq i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp eq i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -2236,7 +2237,7 @@ define i1 @lshrult_01_01_exact(i4 %x) { define i1 @lshrult_01_02_exact(i4 %x) { ; CHECK-LABEL: @lshrult_01_02_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -2246,7 +2247,7 @@ define i1 @lshrult_01_02_exact(i4 %x) { define i1 @lshrult_01_03_exact(i4 %x) { ; CHECK-LABEL: @lshrult_01_03_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, 6 +; 
CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], 6 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -2256,7 +2257,7 @@ define i1 @lshrult_01_03_exact(i4 %x) { define i1 @lshrult_01_04_exact(i4 %x) { ; CHECK-LABEL: @lshrult_01_04_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -2266,7 +2267,7 @@ define i1 @lshrult_01_04_exact(i4 %x) { define i1 @lshrult_01_05_exact(i4 %x) { ; CHECK-LABEL: @lshrult_01_05_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, -6 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], -6 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -2276,7 +2277,7 @@ define i1 @lshrult_01_05_exact(i4 %x) { define i1 @lshrult_01_06_exact(i4 %x) { ; CHECK-LABEL: @lshrult_01_06_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ult i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -2286,7 +2287,7 @@ define i1 @lshrult_01_06_exact(i4 %x) { define i1 @lshrult_01_07_exact(i4 %x) { ; CHECK-LABEL: @lshrult_01_07_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ne i4 %x, -2 +; CHECK-NEXT: [[C:%.*]] = icmp ne i4 [[X:%.*]], -2 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 1 @@ -2377,7 +2378,7 @@ define i1 @lshrult_02_00_exact(i4 %x) { define i1 @lshrult_02_01_exact(i4 %x) { ; CHECK-LABEL: @lshrult_02_01_exact( -; CHECK-NEXT: [[C:%.*]] = icmp eq i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp eq i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 2 @@ -2387,7 +2388,7 @@ define i1 @lshrult_02_01_exact(i4 %x) { define i1 @lshrult_02_02_exact(i4 %x) { ; CHECK-LABEL: @lshrult_02_02_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 2 @@ -2397,7 +2398,7 @@ define i1 @lshrult_02_02_exact(i4 %x) { define i1 @lshrult_02_03_exact(i4 %x) { ; CHECK-LABEL: @lshrult_02_03_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ne i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp ne i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 2 @@ -2524,7 +2525,7 @@ define i1 @lshrult_03_00_exact(i4 %x) { define i1 @lshrult_03_01_exact(i4 %x) { ; CHECK-LABEL: @lshrult_03_01_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ne i4 %x, -8 +; CHECK-NEXT: [[C:%.*]] = icmp ne i4 [[X:%.*]], -8 ; CHECK-NEXT: ret i1 [[C]] ; %s = lshr exact i4 %x, 3 @@ -2660,7 +2661,7 @@ define i1 @lshrult_03_15_exact(i4 %x) { define i1 @ashrsgt_01_00_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_00_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -2670,7 +2671,7 @@ define i1 @ashrsgt_01_00_exact(i4 %x) { define i1 @ashrsgt_01_01_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_01_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, 2 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], 2 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -2680,7 +2681,7 @@ define i1 @ashrsgt_01_01_exact(i4 %x) { define i1 @ashrsgt_01_02_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_02_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -2771,7 +2772,7 @@ define i1 @ashrsgt_01_11_exact(i4 %x) { define i1 @ashrsgt_01_12_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_12_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ne i4 %x, -8 +; CHECK-NEXT: [[C:%.*]] = icmp ne i4 [[X:%.*]], -8 ; CHECK-NEXT: ret 
i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -2781,7 +2782,7 @@ define i1 @ashrsgt_01_12_exact(i4 %x) { define i1 @ashrsgt_01_13_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_13_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -6 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -6 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -2791,7 +2792,7 @@ define i1 @ashrsgt_01_13_exact(i4 %x) { define i1 @ashrsgt_01_14_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_14_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -2801,7 +2802,7 @@ define i1 @ashrsgt_01_14_exact(i4 %x) { define i1 @ashrsgt_01_15_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_01_15_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -2811,7 +2812,7 @@ define i1 @ashrsgt_01_15_exact(i4 %x) { define i1 @ashrsgt_02_00_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_02_00_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 2 @@ -2938,7 +2939,7 @@ define i1 @ashrsgt_02_13_exact(i4 %x) { define i1 @ashrsgt_02_14_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_02_14_exact( -; CHECK-NEXT: [[C:%.*]] = icmp ne i4 %x, -8 +; CHECK-NEXT: [[C:%.*]] = icmp ne i4 [[X:%.*]], -8 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 2 @@ -2948,7 +2949,7 @@ define i1 @ashrsgt_02_14_exact(i4 %x) { define i1 @ashrsgt_02_15_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_02_15_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 2 @@ -3093,7 +3094,7 @@ define i1 @ashrsgt_03_14_exact(i4 %x) { define i1 @ashrsgt_03_15_exact(i4 %x) { ; CHECK-LABEL: @ashrsgt_03_15_exact( -; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 %x, -1 +; CHECK-NEXT: [[C:%.*]] = icmp sgt i4 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 3 @@ -3103,7 +3104,7 @@ define i1 @ashrsgt_03_15_exact(i4 %x) { define i1 @ashrslt_01_00_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_01_00_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -3113,7 +3114,7 @@ define i1 @ashrslt_01_00_exact(i4 %x) { define i1 @ashrslt_01_01_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_01_01_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 2 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 2 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -3123,7 +3124,7 @@ define i1 @ashrslt_01_01_exact(i4 %x) { define i1 @ashrslt_01_02_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_01_02_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -3133,7 +3134,7 @@ define i1 @ashrslt_01_02_exact(i4 %x) { define i1 @ashrslt_01_03_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_01_03_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 6 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 6 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -3224,7 +3225,7 @@ define i1 @ashrslt_01_12_exact(i4 %x) { define i1 @ashrslt_01_13_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_01_13_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, -6 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], -6 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -3234,7 +3235,7 @@ 
define i1 @ashrslt_01_13_exact(i4 %x) { define i1 @ashrslt_01_14_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_01_14_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -3244,7 +3245,7 @@ define i1 @ashrslt_01_14_exact(i4 %x) { define i1 @ashrslt_01_15_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_01_15_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, -2 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], -2 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 1 @@ -3254,7 +3255,7 @@ define i1 @ashrslt_01_15_exact(i4 %x) { define i1 @ashrslt_02_00_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_02_00_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 2 @@ -3264,7 +3265,7 @@ define i1 @ashrslt_02_00_exact(i4 %x) { define i1 @ashrslt_02_01_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_02_01_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 2 @@ -3391,7 +3392,7 @@ define i1 @ashrslt_02_14_exact(i4 %x) { define i1 @ashrslt_02_15_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_02_15_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, -4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], -4 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 2 @@ -3401,7 +3402,7 @@ define i1 @ashrslt_02_15_exact(i4 %x) { define i1 @ashrslt_03_00_exact(i4 %x) { ; CHECK-LABEL: @ashrslt_03_00_exact( -; CHECK-NEXT: [[C:%.*]] = icmp slt i4 %x, 0 +; CHECK-NEXT: [[C:%.*]] = icmp slt i4 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %s = ashr exact i4 %x, 3 diff --git a/llvm/test/Transforms/InstCombine/load-bitcast-vec.ll b/llvm/test/Transforms/InstCombine/load-bitcast-vec.ll index e6540ee70611f9..cb1b224e060913 100644 --- a/llvm/test/Transforms/InstCombine/load-bitcast-vec.ll +++ b/llvm/test/Transforms/InstCombine/load-bitcast-vec.ll @@ -67,6 +67,41 @@ define float @matching_scalar_small_deref(<4 x float>* dereferenceable(15) %p) { ret float %r } +define float @matching_scalar_smallest_deref(<4 x float>* dereferenceable(1) %p) { +; CHECK-LABEL: @matching_scalar_smallest_deref( +; CHECK-NEXT: [[BC:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[R:%.*]] = load float, float* [[BC]], align 16 +; CHECK-NEXT: ret float [[R]] +; + %bc = bitcast <4 x float>* %p to float* + %r = load float, float* %bc, align 16 + ret float %r +} + +define float @matching_scalar_smallest_deref_or_null(<4 x float>* dereferenceable_or_null(1) %p) { +; CHECK-LABEL: @matching_scalar_smallest_deref_or_null( +; CHECK-NEXT: [[BC:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[R:%.*]] = load float, float* [[BC]], align 16 +; CHECK-NEXT: ret float [[R]] +; + %bc = bitcast <4 x float>* %p to float* + %r = load float, float* %bc, align 16 + ret float %r +} + +; TODO: Is a null pointer inbounds in any address space? 
+ +define float @matching_scalar_smallest_deref_or_null_addrspace(<4 x float> addrspace(4)* dereferenceable_or_null(1) %p) { +; CHECK-LABEL: @matching_scalar_smallest_deref_or_null_addrspace( +; CHECK-NEXT: [[BC:%.*]] = getelementptr inbounds <4 x float>, <4 x float> addrspace(4)* [[P:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[R:%.*]] = load float, float addrspace(4)* [[BC]], align 16 +; CHECK-NEXT: ret float [[R]] +; + %bc = bitcast <4 x float> addrspace(4)* %p to float addrspace(4)* + %r = load float, float addrspace(4)* %bc, align 16 + ret float %r +} + define float @matching_scalar_volatile(<4 x float>* dereferenceable(16) %p) { ; CHECK-LABEL: @matching_scalar_volatile( ; CHECK-NEXT: [[BC:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0 diff --git a/llvm/test/Transforms/InstCombine/memcpy-1.ll b/llvm/test/Transforms/InstCombine/memcpy-1.ll index ef020726dbee56..789e5ebd746780 100644 --- a/llvm/test/Transforms/InstCombine/memcpy-1.ll +++ b/llvm/test/Transforms/InstCombine/memcpy-1.ll @@ -20,7 +20,7 @@ define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) { ; Verify that the strictfp attr doesn't block this optimization. -define i8* @test_simplify2(i8* %mem1, i8* %mem2, i32 %size) { +define i8* @test_simplify2(i8* %mem1, i8* %mem2, i32 %size) strictfp { ; CHECK-LABEL: @test_simplify2( ; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[MEM1:%.*]], i8* align 1 [[MEM2:%.*]], i32 [[SIZE:%.*]], i1 false) ; CHECK-NEXT: ret i8* [[MEM1]] diff --git a/llvm/test/Transforms/InstCombine/memset.ll b/llvm/test/Transforms/InstCombine/memset.ll index 7d531f2965d0d9..b994d97c7a2571 100644 --- a/llvm/test/Transforms/InstCombine/memset.ll +++ b/llvm/test/Transforms/InstCombine/memset.ll @@ -3,7 +3,7 @@ define i32 @test([1024 x i8]* %target) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[TARGET:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [1024 x i8], [1024 x i8]* [[TARGET:%.*]], i64 0, i64 0 ; CHECK-NEXT: store i8 1, i8* [[TMP1]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast [1024 x i8]* [[TARGET]] to i16* ; CHECK-NEXT: store i16 257, i16* [[TMP2]], align 2 diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll new file mode 100644 index 00000000000000..a0175387d1cbae --- /dev/null +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -instcombine -S | FileCheck %s + +; Given pattern: +; (trunc (iSrc x a>> Q) to iDst) a>> K +; we should rewrite it as +; (trunc (iSrc x a>> (Q+K)) to iDst) +; iff (Q+K) is bitwidth(iSrc)-1 +; THIS FOLD DOES *NOT* REQUIRE ANY 'nuw'/`nsw` FLAGS! 
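+; Worked instance (@t0 below, iSrc = i32, iDst = i16): Q = 32 - %y and K = %y - 1,
+; so Q + K = (32 - %y) + (%y - 1) = 31 = bitwidth(i32) - 1, and the two shifts
+; collapse to a single `ashr i32 %x, 31` (a splat of the sign bit) plus the trunc.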
+ +; Basic scalar test + +define i16 @t0(i32 %x, i16 %y) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[T5:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[T5]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = ashr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -1 + %t5 = ashr i16 %t3, %t4 + ret i16 %t5 +} + +; Basic vector tests + +define <2 x i16> @t1_vec_splat(<2 x i32> %x, <2 x i16> %y) { +; CHECK-LABEL: @t1_vec_splat( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc <2 x i32> [[TMP1]] to <2 x i16> +; CHECK-NEXT: ret <2 x i16> [[T5]] +; + %t0 = sub <2 x i16> , %y + %t1 = zext <2 x i16> %t0 to <2 x i32> + %t2 = ashr <2 x i32> %x, %t1 + %t3 = trunc <2 x i32> %t2 to <2 x i16> + %t4 = add <2 x i16> %y, + %t5 = ashr <2 x i16> %t3, %t4 + ret <2 x i16> %t5 +} + +define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t3_vec_nonsplat_undef0( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +; CHECK-NEXT: ret <3 x i16> [[T5]] +; + %t0 = sub <3 x i16> , %y + %t1 = zext <3 x i16> %t0 to <3 x i32> + %t2 = ashr <3 x i32> %x, %t1 + %t3 = trunc <3 x i32> %t2 to <3 x i16> + %t4 = add <3 x i16> %y, + %t5 = ashr <3 x i16> %t3, %t4 + ret <3 x i16> %t5 +} + +define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t4_vec_nonsplat_undef1( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +; CHECK-NEXT: ret <3 x i16> [[T5]] +; + %t0 = sub <3 x i16> , %y + %t1 = zext <3 x i16> %t0 to <3 x i32> + %t2 = ashr <3 x i32> %x, %t1 + %t3 = trunc <3 x i32> %t2 to <3 x i16> + %t4 = add <3 x i16> %y, + %t5 = ashr <3 x i16> %t3, %t4 + ret <3 x i16> %t5 +} + +define <3 x i16> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t5_vec_nonsplat_undef1( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +; CHECK-NEXT: ret <3 x i16> [[T5]] +; + %t0 = sub <3 x i16> , %y + %t1 = zext <3 x i16> %t0 to <3 x i32> + %t2 = ashr <3 x i32> %x, %t1 + %t3 = trunc <3 x i32> %t2 to <3 x i16> + %t4 = add <3 x i16> %y, + %t5 = ashr <3 x i16> %t3, %t4 + ret <3 x i16> %t5 +} + +; One-use tests + +declare void @use16(i16) +declare void @use32(i32) + +define i16 @t6_extrause0(i32 %x, i16 %y) { +; CHECK-LABEL: @t6_extrause0( +; CHECK-NEXT: [[T0:%.*]] = sub i16 32, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = zext i16 [[T0]] to i32 +; CHECK-NEXT: [[T2:%.*]] = ashr i32 [[X:%.*]], [[T1]] +; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i16 +; CHECK-NEXT: call void @use16(i16 [[T3]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X]], 31 +; CHECK-NEXT: [[T5:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[T5]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = ashr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -1 + call void @use16(i16 %t3) + %t5 = ashr i16 %t3, %t4 + ret i16 %t5 +} + +define i16 @t7_extrause1(i32 %x, i16 %y) { +; CHECK-LABEL: @t7_extrause1( +; CHECK-NEXT: [[T4:%.*]] = add i16 [[Y:%.*]], -1 +; CHECK-NEXT: call void @use16(i16 [[T4]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[T5:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[T5]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = ashr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -1 
+ call void @use16(i16 %t4) + %t5 = ashr i16 %t3, %t4 + ret i16 %t5 +} + +define i16 @t8_extrause2(i32 %x, i16 %y) { +; CHECK-LABEL: @t8_extrause2( +; CHECK-NEXT: [[T0:%.*]] = sub i16 32, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = zext i16 [[T0]] to i32 +; CHECK-NEXT: [[T2:%.*]] = ashr i32 [[X:%.*]], [[T1]] +; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i16 +; CHECK-NEXT: [[T4:%.*]] = add i16 [[Y]], -1 +; CHECK-NEXT: call void @use16(i16 [[T3]]) +; CHECK-NEXT: call void @use16(i16 [[T4]]) +; CHECK-NEXT: [[T5:%.*]] = ashr i16 [[T3]], [[T4]] +; CHECK-NEXT: ret i16 [[T5]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = ashr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -1 + call void @use16(i16 %t3) + call void @use16(i16 %t4) + %t5 = ashr i16 %t3, %t4 + ret i16 %t5 +} + +; No 'nuw'/'nsw' flags are to be propagated! +; But we can't test that, such IR does not reach that code. + +; Negative tests + +; Can only fold if we are extracting the sign bit. +define i16 @t9_ashr(i32 %x, i16 %y) { +; CHECK-LABEL: @t9_ashr( +; CHECK-NEXT: [[T0:%.*]] = sub i16 32, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = zext i16 [[T0]] to i32 +; CHECK-NEXT: [[T2:%.*]] = ashr i32 [[X:%.*]], [[T1]] +; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i16 +; CHECK-NEXT: ret i16 [[T3]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = ashr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -2 + %t5 = ashr i16 %t3, %t4 + ret i16 %t3 +} diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll new file mode 100644 index 00000000000000..7b9962eacb117f --- /dev/null +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -instcombine -S | FileCheck %s + +; Given pattern: +; (trunc (iSrc x l>> Q) to iDst) l>> K +; we should rewrite it as +; (trunc (iSrc x l>> (Q+K)) to iDst) +; iff (Q+K) is bitwidth(iSrc)-1 +; THIS FOLD DOES *NOT* REQUIRE ANY 'nuw'/`nsw` FLAGS! 
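+; Worked instance (@t0 below, iSrc = i32, iDst = i16): Q = 32 - %y and K = %y - 1,
+; so Q + K = 31 = bitwidth(i32) - 1, and the two shifts collapse to a single
+; `lshr i32 %x, 31` (extracting just the sign bit) plus the trunc.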
+ +; Basic scalar test + +define i16 @t0(i32 %x, i16 %y) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[T5:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[T5]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = lshr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -1 + %t5 = lshr i16 %t3, %t4 + ret i16 %t5 +} + +; Basic vector tests + +define <2 x i16> @t1_vec_splat(<2 x i32> %x, <2 x i16> %y) { +; CHECK-LABEL: @t1_vec_splat( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc <2 x i32> [[TMP1]] to <2 x i16> +; CHECK-NEXT: ret <2 x i16> [[T5]] +; + %t0 = sub <2 x i16> , %y + %t1 = zext <2 x i16> %t0 to <2 x i32> + %t2 = lshr <2 x i32> %x, %t1 + %t3 = trunc <2 x i32> %t2 to <2 x i16> + %t4 = add <2 x i16> %y, + %t5 = lshr <2 x i16> %t3, %t4 + ret <2 x i16> %t5 +} + +define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t3_vec_nonsplat_undef0( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +; CHECK-NEXT: ret <3 x i16> [[T5]] +; + %t0 = sub <3 x i16> , %y + %t1 = zext <3 x i16> %t0 to <3 x i32> + %t2 = lshr <3 x i32> %x, %t1 + %t3 = trunc <3 x i32> %t2 to <3 x i16> + %t4 = add <3 x i16> %y, + %t5 = lshr <3 x i16> %t3, %t4 + ret <3 x i16> %t5 +} + +define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t4_vec_nonsplat_undef1( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +; CHECK-NEXT: ret <3 x i16> [[T5]] +; + %t0 = sub <3 x i16> , %y + %t1 = zext <3 x i16> %t0 to <3 x i32> + %t2 = lshr <3 x i32> %x, %t1 + %t3 = trunc <3 x i32> %t2 to <3 x i16> + %t4 = add <3 x i16> %y, + %t5 = lshr <3 x i16> %t3, %t4 + ret <3 x i16> %t5 +} + +define <3 x i16> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t5_vec_nonsplat_undef1( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +; CHECK-NEXT: ret <3 x i16> [[T5]] +; + %t0 = sub <3 x i16> , %y + %t1 = zext <3 x i16> %t0 to <3 x i32> + %t2 = lshr <3 x i32> %x, %t1 + %t3 = trunc <3 x i32> %t2 to <3 x i16> + %t4 = add <3 x i16> %y, + %t5 = lshr <3 x i16> %t3, %t4 + ret <3 x i16> %t5 +} + +; One-use tests + +declare void @use16(i16) +declare void @use32(i32) + +define i16 @t6_extrause0(i32 %x, i16 %y) { +; CHECK-LABEL: @t6_extrause0( +; CHECK-NEXT: [[T0:%.*]] = sub i16 32, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = zext i16 [[T0]] to i32 +; CHECK-NEXT: [[T2:%.*]] = lshr i32 [[X:%.*]], [[T1]] +; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i16 +; CHECK-NEXT: call void @use16(i16 [[T3]]) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 31 +; CHECK-NEXT: [[T5:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[T5]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = lshr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -1 + call void @use16(i16 %t3) + %t5 = lshr i16 %t3, %t4 + ret i16 %t5 +} + +define i16 @t7_extrause1(i32 %x, i16 %y) { +; CHECK-LABEL: @t7_extrause1( +; CHECK-NEXT: [[T4:%.*]] = add i16 [[Y:%.*]], -1 +; CHECK-NEXT: call void @use16(i16 [[T4]]) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 31 +; CHECK-NEXT: [[T5:%.*]] = trunc i32 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[T5]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = lshr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -1 
+ call void @use16(i16 %t4) + %t5 = lshr i16 %t3, %t4 + ret i16 %t5 +} + +define i16 @t8_extrause2(i32 %x, i16 %y) { +; CHECK-LABEL: @t8_extrause2( +; CHECK-NEXT: [[T0:%.*]] = sub i16 32, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = zext i16 [[T0]] to i32 +; CHECK-NEXT: [[T2:%.*]] = lshr i32 [[X:%.*]], [[T1]] +; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i16 +; CHECK-NEXT: [[T4:%.*]] = add i16 [[Y]], -1 +; CHECK-NEXT: call void @use16(i16 [[T3]]) +; CHECK-NEXT: call void @use16(i16 [[T4]]) +; CHECK-NEXT: [[T5:%.*]] = lshr i16 [[T3]], [[T4]] +; CHECK-NEXT: ret i16 [[T5]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = lshr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -1 + call void @use16(i16 %t3) + call void @use16(i16 %t4) + %t5 = lshr i16 %t3, %t4 + ret i16 %t5 +} + +; No 'nuw'/'nsw' flags are to be propagated! +; But we can't test that, such IR does not reach that code. + +; Negative tests + +; Can only fold if we are extracting the sign bit. +define i16 @t9_lshr(i32 %x, i16 %y) { +; CHECK-LABEL: @t9_lshr( +; CHECK-NEXT: [[T0:%.*]] = sub i16 32, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = zext i16 [[T0]] to i32 +; CHECK-NEXT: [[T2:%.*]] = lshr i32 [[X:%.*]], [[T1]] +; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i16 +; CHECK-NEXT: ret i16 [[T3]] +; + %t0 = sub i16 32, %y + %t1 = zext i16 %t0 to i32 + %t2 = lshr i32 %x, %t1 + %t3 = trunc i32 %t2 to i16 + %t4 = add i16 %y, -2 + %t5 = lshr i16 %t3, %t4 + ret i16 %t3 +} diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-shl.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-shl.ll index 6675785929c048..2328ec7965e7f1 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-shl.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-shl.ll @@ -174,40 +174,6 @@ define i16 @t8_extrause2(i32 %x, i16 %y) { ; Negative tests -; No folding possible for right-shifts.. 
-define i16 @t9_shl(i32 %x, i16 %y) { -; CHECK-LABEL: @t9_shl( -; CHECK-NEXT: [[T0:%.*]] = sub i16 32, [[Y:%.*]] -; CHECK-NEXT: [[T1:%.*]] = zext i16 [[T0]] to i32 -; CHECK-NEXT: [[T2:%.*]] = lshr i32 [[X:%.*]], [[T1]] -; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i16 -; CHECK-NEXT: ret i16 [[T3]] -; - %t0 = sub i16 32, %y - %t1 = zext i16 %t0 to i32 - %t2 = lshr i32 %x, %t1 - %t3 = trunc i32 %t2 to i16 - %t4 = add i16 %y, -24 - %t5 = lshr i16 %t3, %t4 - ret i16 %t3 -} -define i16 @t10_ashr(i32 %x, i16 %y) { -; CHECK-LABEL: @t10_ashr( -; CHECK-NEXT: [[T0:%.*]] = sub i16 32, [[Y:%.*]] -; CHECK-NEXT: [[T1:%.*]] = zext i16 [[T0]] to i32 -; CHECK-NEXT: [[T2:%.*]] = ashr i32 [[X:%.*]], [[T1]] -; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i16 -; CHECK-NEXT: ret i16 [[T3]] -; - %t0 = sub i16 32, %y - %t1 = zext i16 %t0 to i32 - %t2 = ashr i32 %x, %t1 - %t3 = trunc i32 %t2 to i16 - %t4 = add i16 %y, -24 - %t5 = ashr i16 %t3, %t4 - ret i16 %t3 -} - ; Can't fold, total shift would be 32 define i16 @n11(i32 %x, i16 %y) { ; CHECK-LABEL: @n11( diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll index a0e6bbe33ee824..9ded69ad7b9001 100644 --- a/llvm/test/Transforms/InstCombine/shift.ll +++ b/llvm/test/Transforms/InstCombine/shift.ll @@ -21,7 +21,7 @@ define <4 x i32> @shl_non_splat_vector(<4 x i32> %A) { define i32 @test6(i32 %A) { ; CHECK-LABEL: @test6( -; CHECK-NEXT: [[C:%.*]] = mul i32 %A, 6 +; CHECK-NEXT: [[C:%.*]] = mul i32 [[A:%.*]], 6 ; CHECK-NEXT: ret i32 [[C]] ; %B = shl i32 %A, 1 ;; convert to an mul instruction @@ -31,7 +31,7 @@ define i32 @test6(i32 %A) { define i32 @test6a(i32 %A) { ; CHECK-LABEL: @test6a( -; CHECK-NEXT: [[C:%.*]] = mul i32 %A, 6 +; CHECK-NEXT: [[C:%.*]] = mul i32 [[A:%.*]], 6 ; CHECK-NEXT: ret i32 [[C]] ; %B = mul i32 %A, 3 @@ -52,7 +52,7 @@ define i8 @test8(i8 %A) { ;; (A << 7) >> 7 === A & 1 define i8 @test9(i8 %A) { ; CHECK-LABEL: @test9( -; CHECK-NEXT: [[B:%.*]] = and i8 %A, 1 +; CHECK-NEXT: [[B:%.*]] = and i8 [[A:%.*]], 1 ; CHECK-NEXT: ret i8 [[B]] ; %B = shl i8 %A, 7 @@ -64,7 +64,7 @@ define i8 @test9(i8 %A) { define i8 @test10(i8 %A) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[B:%.*]] = and i8 %A, -128 +; CHECK-NEXT: [[B:%.*]] = and i8 [[A:%.*]], -128 ; CHECK-NEXT: ret i8 [[B]] ; %B = lshr i8 %A, 7 @@ -75,7 +75,7 @@ define i8 @test10(i8 %A) { ;; Allow the simplification when the lshr shift is exact. define i8 @test10a(i8 %A) { ; CHECK-LABEL: @test10a( -; CHECK-NEXT: ret i8 %A +; CHECK-NEXT: ret i8 [[A:%.*]] ; %B = lshr exact i8 %A, 7 %C = shl i8 %B, 7 @@ -85,14 +85,14 @@ define i8 @test10a(i8 %A) { ;; This transformation is deferred to DAGCombine: ;; (A >> 3) << 4 === (A & 0x1F) << 1 ;; The shl may be valuable to scalar evolution. -define i8 @test11(i8 %A) { +define i8 @test11(i8 %x) { ; CHECK-LABEL: @test11( -; CHECK-NEXT: [[A:%.*]] = mul i8 %A, 3 +; CHECK-NEXT: [[A:%.*]] = mul i8 [[X:%.*]], 3 ; CHECK-NEXT: [[B:%.*]] = lshr i8 [[A]], 3 ; CHECK-NEXT: [[C:%.*]] = shl i8 [[B]], 4 ; CHECK-NEXT: ret i8 [[C]] ; - %a = mul i8 %A, 3 + %a = mul i8 %x, 3 %B = lshr i8 %a, 3 %C = shl i8 %B, 4 ret i8 %C @@ -101,7 +101,7 @@ define i8 @test11(i8 %A) { ;; Allow the simplification in InstCombine when the lshr shift is exact. 
define i8 @test11a(i8 %A) { ; CHECK-LABEL: @test11a( -; CHECK-NEXT: [[C:%.*]] = mul i8 %A, 6 +; CHECK-NEXT: [[C:%.*]] = mul i8 [[A:%.*]], 6 ; CHECK-NEXT: ret i8 [[C]] ; %a = mul i8 %A, 3 @@ -114,8 +114,8 @@ define i8 @test11a(i8 %A) { ;; (A >> 8) << 8 === A & -256 define i32 @test12(i32 %A) { ; CHECK-LABEL: @test12( -; CHECK-NEXT: [[B1:%.*]] = and i32 %A, -256 -; CHECK-NEXT: ret i32 [[B1]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], -256 +; CHECK-NEXT: ret i32 [[TMP1]] ; %B = ashr i32 %A, 8 %C = shl i32 %B, 8 @@ -141,14 +141,14 @@ define i8 @shishi(i8 %x) { ;; This transformation is deferred to DAGCombine: ;; (A >> 3) << 4 === (A & -8) * 2 ;; The shl may be valuable to scalar evolution. -define i8 @test13(i8 %A) { +define i8 @test13(i8 %x) { ; CHECK-LABEL: @test13( -; CHECK-NEXT: [[A:%.*]] = mul i8 %A, 3 -; CHECK-NEXT: [[B1:%.*]] = lshr i8 [[A]], 3 -; CHECK-NEXT: [[C:%.*]] = shl i8 [[B1]], 4 +; CHECK-NEXT: [[A:%.*]] = mul i8 [[X:%.*]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 [[A]], 3 +; CHECK-NEXT: [[C:%.*]] = shl i8 [[TMP1]], 4 ; CHECK-NEXT: ret i8 [[C]] ; - %a = mul i8 %A, 3 + %a = mul i8 %x, 3 %B = ashr i8 %a, 3 %C = shl i8 %B, 4 ret i8 %C @@ -156,7 +156,7 @@ define i8 @test13(i8 %A) { define i8 @test13a(i8 %A) { ; CHECK-LABEL: @test13a( -; CHECK-NEXT: [[C:%.*]] = mul i8 %A, 6 +; CHECK-NEXT: [[C:%.*]] = mul i8 [[A:%.*]], 6 ; CHECK-NEXT: ret i8 [[C]] ; %a = mul i8 %A, 3 @@ -168,7 +168,7 @@ define i8 @test13a(i8 %A) { ;; D = ((B | 1234) << 4) === ((B << 4)|(1234 << 4) define i32 @test14(i32 %A) { ; CHECK-LABEL: @test14( -; CHECK-NEXT: [[B:%.*]] = and i32 %A, -19760 +; CHECK-NEXT: [[B:%.*]] = and i32 [[A:%.*]], -19760 ; CHECK-NEXT: [[C:%.*]] = or i32 [[B]], 19744 ; CHECK-NEXT: ret i32 [[C]] ; @@ -181,7 +181,7 @@ define i32 @test14(i32 %A) { ;; D = ((B | 1234) << 4) === ((B << 4)|(1234 << 4) define i32 @test14a(i32 %A) { ; CHECK-LABEL: @test14a( -; CHECK-NEXT: [[C:%.*]] = and i32 %A, 77 +; CHECK-NEXT: [[C:%.*]] = and i32 [[A:%.*]], 77 ; CHECK-NEXT: ret i32 [[C]] ; %B = shl i32 %A, 4 @@ -192,7 +192,7 @@ define i32 @test14a(i32 %A) { define i32 @test15(i1 %C) { ; CHECK-LABEL: @test15( -; CHECK-NEXT: [[A:%.*]] = select i1 %C, i32 12, i32 4 +; CHECK-NEXT: [[A:%.*]] = select i1 [[C:%.*]], i32 12, i32 4 ; CHECK-NEXT: ret i32 [[A]] ; %A = select i1 %C, i32 3, i32 1 @@ -202,7 +202,7 @@ define i32 @test15(i1 %C) { define i32 @test15a(i1 %C) { ; CHECK-LABEL: @test15a( -; CHECK-NEXT: [[V:%.*]] = select i1 %C, i32 512, i32 128 +; CHECK-NEXT: [[V:%.*]] = select i1 [[C:%.*]], i32 512, i32 128 ; CHECK-NEXT: ret i32 [[V]] ; %A = select i1 %C, i8 3, i8 1 @@ -213,7 +213,7 @@ define i32 @test15a(i1 %C) { define i1 @test16(i32 %X) { ; CHECK-LABEL: @test16( -; CHECK-NEXT: [[TMP_6:%.*]] = and i32 %X, 16 +; CHECK-NEXT: [[TMP_6:%.*]] = and i32 [[X:%.*]], 16 ; CHECK-NEXT: [[TMP_7:%.*]] = icmp ne i32 [[TMP_6]], 0 ; CHECK-NEXT: ret i1 [[TMP_7]] ; @@ -225,7 +225,7 @@ define i1 @test16(i32 %X) { define i1 @test17(i32 %A) { ; CHECK-LABEL: @test17( -; CHECK-NEXT: [[B_MASK:%.*]] = and i32 %A, -8 +; CHECK-NEXT: [[B_MASK:%.*]] = and i32 [[A:%.*]], -8 ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[B_MASK]], 9872 ; CHECK-NEXT: ret i1 [[C]] ; @@ -236,7 +236,7 @@ define i1 @test17(i32 %A) { define <2 x i1> @test17vec(<2 x i32> %A) { ; CHECK-LABEL: @test17vec( -; CHECK-NEXT: [[B_MASK:%.*]] = and <2 x i32> %A, +; CHECK-NEXT: [[B_MASK:%.*]] = and <2 x i32> [[A:%.*]], ; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i32> [[B_MASK]], ; CHECK-NEXT: ret <2 x i1> [[C]] ; @@ -257,7 +257,7 @@ define i1 @test18(i8 %A) { define i1 @test19(i32 %A) { ; 
CHECK-LABEL: @test19( -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 %A, 4 +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[A:%.*]], 4 ; CHECK-NEXT: ret i1 [[C]] ; %B = ashr i32 %A, 2 @@ -268,7 +268,7 @@ define i1 @test19(i32 %A) { define <2 x i1> @test19vec(<2 x i32> %A) { ; CHECK-LABEL: @test19vec( -; CHECK-NEXT: [[C:%.*]] = icmp ult <2 x i32> %A, +; CHECK-NEXT: [[C:%.*]] = icmp ult <2 x i32> [[A:%.*]], ; CHECK-NEXT: ret <2 x i1> [[C]] ; %B = ashr <2 x i32> %A, @@ -279,7 +279,7 @@ define <2 x i1> @test19vec(<2 x i32> %A) { ;; X >u ~4 define i1 @test19a(i32 %A) { ; CHECK-LABEL: @test19a( -; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 %A, -5 +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[A:%.*]], -5 ; CHECK-NEXT: ret i1 [[C]] ; %B = ashr i32 %A, 2 @@ -289,7 +289,7 @@ define i1 @test19a(i32 %A) { define <2 x i1> @test19a_vec(<2 x i32> %A) { ; CHECK-LABEL: @test19a_vec( -; CHECK-NEXT: [[C:%.*]] = icmp ugt <2 x i32> %A, +; CHECK-NEXT: [[C:%.*]] = icmp ugt <2 x i32> [[A:%.*]], ; CHECK-NEXT: ret <2 x i1> [[C]] ; %B = ashr <2 x i32> %A, @@ -309,7 +309,7 @@ define i1 @test20(i8 %A) { define i1 @test21(i8 %A) { ; CHECK-LABEL: @test21( -; CHECK-NEXT: [[B_MASK:%.*]] = and i8 %A, 15 +; CHECK-NEXT: [[B_MASK:%.*]] = and i8 [[A:%.*]], 15 ; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[B_MASK]], 8 ; CHECK-NEXT: ret i1 [[C]] ; @@ -320,7 +320,7 @@ define i1 @test21(i8 %A) { define i1 @test22(i8 %A) { ; CHECK-LABEL: @test22( -; CHECK-NEXT: [[B_MASK:%.*]] = and i8 %A, 15 +; CHECK-NEXT: [[B_MASK:%.*]] = and i8 [[A:%.*]], 15 ; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[B_MASK]], 0 ; CHECK-NEXT: ret i1 [[C]] ; @@ -331,7 +331,7 @@ define i1 @test22(i8 %A) { define i8 @test23(i32 %A) { ; CHECK-LABEL: @test23( -; CHECK-NEXT: [[D:%.*]] = trunc i32 %A to i8 +; CHECK-NEXT: [[D:%.*]] = trunc i32 [[A:%.*]] to i8 ; CHECK-NEXT: ret i8 [[D]] ; ;; casts not needed @@ -343,7 +343,7 @@ define i8 @test23(i32 %A) { define i8 @test24(i8 %X) { ; CHECK-LABEL: @test24( -; CHECK-NEXT: [[Z:%.*]] = and i8 %X, 3 +; CHECK-NEXT: [[Z:%.*]] = and i8 [[X:%.*]], 3 ; CHECK-NEXT: ret i8 [[Z]] ; %Y = and i8 %X, -5 @@ -354,8 +354,8 @@ define i8 @test24(i8 %X) { define i32 @test25(i32 %tmp.2, i32 %AA) { ; CHECK-LABEL: @test25( -; CHECK-NEXT: [[TMP_3:%.*]] = and i32 %tmp.2, -131072 -; CHECK-NEXT: [[X2:%.*]] = add i32 [[TMP_3]], %AA +; CHECK-NEXT: [[TMP_3:%.*]] = and i32 [[TMP_2:%.*]], -131072 +; CHECK-NEXT: [[X2:%.*]] = add i32 [[TMP_3]], [[AA:%.*]] ; CHECK-NEXT: [[TMP_6:%.*]] = and i32 [[X2]], -131072 ; CHECK-NEXT: ret i32 [[TMP_6]] ; @@ -368,8 +368,8 @@ define i32 @test25(i32 %tmp.2, i32 %AA) { define <2 x i32> @test25_vector(<2 x i32> %tmp.2, <2 x i32> %AA) { ; CHECK-LABEL: @test25_vector( -; CHECK-NEXT: [[TMP_3:%.*]] = and <2 x i32> %tmp.2, -; CHECK-NEXT: [[X2:%.*]] = add <2 x i32> [[TMP_3]], %AA +; CHECK-NEXT: [[TMP_3:%.*]] = and <2 x i32> [[TMP_2:%.*]], +; CHECK-NEXT: [[X2:%.*]] = add <2 x i32> [[TMP_3]], [[AA:%.*]] ; CHECK-NEXT: [[TMP_6:%.*]] = and <2 x i32> [[X2]], ; CHECK-NEXT: ret <2 x i32> [[TMP_6]] ; @@ -383,7 +383,7 @@ define <2 x i32> @test25_vector(<2 x i32> %tmp.2, <2 x i32> %AA) { ;; handle casts between shifts. 
define i32 @test26(i32 %A) { ; CHECK-LABEL: @test26( -; CHECK-NEXT: [[B:%.*]] = and i32 %A, -2 +; CHECK-NEXT: [[B:%.*]] = and i32 [[A:%.*]], -2 ; CHECK-NEXT: ret i32 [[B]] ; %B = lshr i32 %A, 1 @@ -395,7 +395,7 @@ define i32 @test26(i32 %A) { define i1 @test27(i32 %x) nounwind { ; CHECK-LABEL: @test27( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 %x, 8 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 8 ; CHECK-NEXT: [[Z:%.*]] = icmp ne i32 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[Z]] ; @@ -406,7 +406,7 @@ define i1 @test27(i32 %x) nounwind { define i1 @test28(i8 %x) { ; CHECK-LABEL: @test28( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 %x, 0 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[CMP]] ; %shr = lshr i8 %x, 7 @@ -416,7 +416,7 @@ define i1 @test28(i8 %x) { define <2 x i1> @test28vec(<2 x i8> %x) { ; CHECK-LABEL: @test28vec( -; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> %x, zeroinitializer +; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i8> [[X:%.*]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shr = lshr <2 x i8> %x, @@ -427,13 +427,13 @@ define <2 x i1> @test28vec(<2 x i8> %x) { define i8 @test28a(i8 %x, i8 %y) { ; CHECK-LABEL: @test28a( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 %x, 7 -; CHECK-NEXT: [[COND1:%.*]] = icmp eq i8 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[COND1]], label %bb2, label %bb1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i8 [[X:%.*]], 7 +; CHECK-NEXT: [[COND1:%.*]] = icmp slt i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND1]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: ret i8 [[TMP1]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], %y +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret i8 [[TMP2]] ; entry: @@ -452,7 +452,7 @@ bb2: define i32 @test29(i64 %d18) { ; CHECK-LABEL: @test29( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP916:%.*]] = lshr i64 %d18, 63 +; CHECK-NEXT: [[TMP916:%.*]] = lshr i64 [[D18:%.*]], 63 ; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP916]] to i32 ; CHECK-NEXT: ret i32 [[TMP10]] ; @@ -466,8 +466,8 @@ entry: define i32 @test30(i32 %A, i32 %B, i32 %C) { ; CHECK-LABEL: @test30( -; CHECK-NEXT: [[X1:%.*]] = and i32 %A, %B -; CHECK-NEXT: [[Z:%.*]] = shl i32 [[X1]], %C +; CHECK-NEXT: [[X1:%.*]] = and i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Z:%.*]] = shl i32 [[X1]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[Z]] ; %X = shl i32 %A, %C @@ -478,8 +478,8 @@ define i32 @test30(i32 %A, i32 %B, i32 %C) { define i32 @test31(i32 %A, i32 %B, i32 %C) { ; CHECK-LABEL: @test31( -; CHECK-NEXT: [[X1:%.*]] = or i32 %A, %B -; CHECK-NEXT: [[Z:%.*]] = lshr i32 [[X1]], %C +; CHECK-NEXT: [[X1:%.*]] = or i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Z:%.*]] = lshr i32 [[X1]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[Z]] ; %X = lshr i32 %A, %C @@ -490,8 +490,8 @@ define i32 @test31(i32 %A, i32 %B, i32 %C) { define i32 @test32(i32 %A, i32 %B, i32 %C) { ; CHECK-LABEL: @test32( -; CHECK-NEXT: [[X1:%.*]] = xor i32 %A, %B -; CHECK-NEXT: [[Z:%.*]] = ashr i32 [[X1]], %C +; CHECK-NEXT: [[X1:%.*]] = xor i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[Z:%.*]] = ashr i32 [[X1]], [[C:%.*]] ; CHECK-NEXT: ret i32 [[Z]] ; %X = ashr i32 %A, %C @@ -502,7 +502,7 @@ define i32 @test32(i32 %A, i32 %B, i32 %C) { define i1 @test33(i32 %X) { ; CHECK-LABEL: @test33( -; CHECK-NEXT: [[TMP1_MASK:%.*]] = and i32 %X, 16777216 +; CHECK-NEXT: [[TMP1_MASK:%.*]] = and i32 [[X:%.*]], 16777216 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1_MASK]], 0 ; CHECK-NEXT: ret i1 [[TMP2]] ; @@ -513,7 +513,7 @@ define i1 @test33(i32 %X) { define <2 x i1> @test33vec(<2 x 
i32> %X) { ; CHECK-LABEL: @test33vec( -; CHECK-NEXT: [[TMP1_MASK:%.*]] = and <2 x i32> %X, +; CHECK-NEXT: [[TMP1_MASK:%.*]] = and <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i32> [[TMP1_MASK]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; @@ -533,7 +533,7 @@ define i1 @test34(i32 %X) { define i1 @test35(i32 %X) { ; CHECK-LABEL: @test35( -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 %X, 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[TMP2]] ; %tmp1 = ashr i32 %X, 7 @@ -543,7 +543,7 @@ define i1 @test35(i32 %X) { define <2 x i1> @test35vec(<2 x i32> %X) { ; CHECK-LABEL: @test35vec( -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> %X, zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[X:%.*]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; %tmp1 = ashr <2 x i32> %X, @@ -553,7 +553,7 @@ define <2 x i1> @test35vec(<2 x i32> %X) { define i128 @test36(i128 %A, i128 %B) { ; CHECK-LABEL: @test36( -; CHECK-NEXT: [[TMP231:%.*]] = or i128 %B, %A +; CHECK-NEXT: [[TMP231:%.*]] = or i128 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[INS:%.*]] = and i128 [[TMP231]], 18446744073709551615 ; CHECK-NEXT: ret i128 [[INS]] ; @@ -566,9 +566,9 @@ define i128 @test36(i128 %A, i128 %B) { define i64 @test37(i128 %A, i32 %B) { ; CHECK-LABEL: @test37( -; CHECK-NEXT: [[TMP22:%.*]] = zext i32 %B to i128 +; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[B:%.*]] to i128 ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i128 [[TMP22]], 32 -; CHECK-NEXT: [[INS:%.*]] = or i128 [[TMP23]], %A +; CHECK-NEXT: [[INS:%.*]] = or i128 [[TMP23]], [[A:%.*]] ; CHECK-NEXT: [[TMP46:%.*]] = trunc i128 [[INS]] to i64 ; CHECK-NEXT: ret i64 [[TMP46]] ; @@ -583,7 +583,7 @@ define i64 @test37(i128 %A, i32 %B) { define <2 x i32> @shl_nuw_nsw_splat_vec(<2 x i8> %x) { ; CHECK-LABEL: @shl_nuw_nsw_splat_vec( -; CHECK-NEXT: [[T2:%.*]] = zext <2 x i8> %x to <2 x i32> +; CHECK-NEXT: [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> ; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw <2 x i32> [[T2]], ; CHECK-NEXT: ret <2 x i32> [[T3]] ; @@ -594,7 +594,7 @@ define <2 x i32> @shl_nuw_nsw_splat_vec(<2 x i8> %x) { define i32 @test38(i32 %x) nounwind readnone { ; CHECK-LABEL: @test38( -; CHECK-NEXT: [[REM1:%.*]] = and i32 %x, 31 +; CHECK-NEXT: [[REM1:%.*]] = and i32 [[X:%.*]], 31 ; CHECK-NEXT: [[SHL:%.*]] = shl i32 1, [[REM1]] ; CHECK-NEXT: ret i32 [[SHL]] ; @@ -607,7 +607,7 @@ define i32 @test38(i32 %x) nounwind readnone { define i8 @test39(i32 %a0) { ; CHECK-LABEL: @test39( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 %a0 to i8 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[A0:%.*]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = shl i8 [[TMP4]], 5 ; CHECK-NEXT: [[TMP49:%.*]] = shl i8 [[TMP4]], 6 ; CHECK-NEXT: [[TMP50:%.*]] = and i8 [[TMP49]], 64 @@ -634,8 +634,8 @@ entry: ; PR9809 define i32 @test40(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: @test40( -; CHECK-NEXT: [[TMP1:%.*]] = add i32 %b, 2 -; CHECK-NEXT: [[DIV:%.*]] = lshr i32 %a, [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[B:%.*]], 2 +; CHECK-NEXT: [[DIV:%.*]] = lshr i32 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[DIV]] ; %shl1 = shl i32 1, %b @@ -646,7 +646,7 @@ define i32 @test40(i32 %a, i32 %b) nounwind { define i32 @test41(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: @test41( -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 8, %b +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 8, [[B:%.*]] ; CHECK-NEXT: ret i32 [[TMP1]] ; %1 = shl i32 1, %b @@ -656,8 +656,8 @@ define i32 @test41(i32 %a, i32 %b) nounwind { define i32 @test42(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: @test42( 
-; CHECK-NEXT: [[DIV:%.*]] = lshr exact i32 4096, %b -; CHECK-NEXT: [[DIV2:%.*]] = udiv i32 %a, [[DIV]] +; CHECK-NEXT: [[DIV:%.*]] = lshr exact i32 4096, [[B:%.*]] +; CHECK-NEXT: [[DIV2:%.*]] = udiv i32 [[A:%.*]], [[DIV]] ; CHECK-NEXT: ret i32 [[DIV2]] ; %div = lshr i32 4096, %b ; must be exact otherwise we'd divide by zero @@ -667,8 +667,8 @@ define i32 @test42(i32 %a, i32 %b) nounwind { define <2 x i32> @test42vec(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @test42vec( -; CHECK-NEXT: [[DIV:%.*]] = lshr exact <2 x i32> , %b -; CHECK-NEXT: [[DIV2:%.*]] = udiv <2 x i32> %a, [[DIV]] +; CHECK-NEXT: [[DIV:%.*]] = lshr exact <2 x i32> , [[B:%.*]] +; CHECK-NEXT: [[DIV2:%.*]] = udiv <2 x i32> [[A:%.*]], [[DIV]] ; CHECK-NEXT: ret <2 x i32> [[DIV2]] ; %div = lshr <2 x i32> , %b ; must be exact otherwise we'd divide by zero @@ -678,8 +678,8 @@ define <2 x i32> @test42vec(<2 x i32> %a, <2 x i32> %b) { define i32 @test43(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: @test43( -; CHECK-NEXT: [[TMP1:%.*]] = add i32 %b, 12 -; CHECK-NEXT: [[DIV2:%.*]] = lshr i32 %a, [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[B:%.*]], 12 +; CHECK-NEXT: [[DIV2:%.*]] = lshr i32 [[A:%.*]], [[TMP1]] ; CHECK-NEXT: ret i32 [[DIV2]] ; %div = shl i32 4096, %b ; must be exact otherwise we'd divide by zero @@ -689,7 +689,7 @@ define i32 @test43(i32 %a, i32 %b) nounwind { define i32 @test44(i32 %a) nounwind { ; CHECK-LABEL: @test44( -; CHECK-NEXT: [[Y:%.*]] = shl i32 %a, 5 +; CHECK-NEXT: [[Y:%.*]] = shl i32 [[A:%.*]], 5 ; CHECK-NEXT: ret i32 [[Y]] ; %y = shl nuw i32 %a, 1 @@ -699,7 +699,7 @@ define i32 @test44(i32 %a) nounwind { define i32 @test45(i32 %a) nounwind { ; CHECK-LABEL: @test45( -; CHECK-NEXT: [[Y:%.*]] = lshr i32 %a, 5 +; CHECK-NEXT: [[Y:%.*]] = lshr i32 [[A:%.*]], 5 ; CHECK-NEXT: ret i32 [[Y]] ; %y = lshr exact i32 %a, 1 @@ -711,7 +711,7 @@ define i32 @test45(i32 %a) nounwind { define i32 @test46(i32 %a) { ; CHECK-LABEL: @test46( -; CHECK-NEXT: [[Z:%.*]] = ashr exact i32 %a, 2 +; CHECK-NEXT: [[Z:%.*]] = ashr exact i32 [[A:%.*]], 2 ; CHECK-NEXT: ret i32 [[Z]] ; %y = ashr exact i32 %a, 3 @@ -723,7 +723,7 @@ define i32 @test46(i32 %a) { define <2 x i32> @test46_splat_vec(<2 x i32> %a) { ; CHECK-LABEL: @test46_splat_vec( -; CHECK-NEXT: [[Z:%.*]] = ashr exact <2 x i32> %a, +; CHECK-NEXT: [[Z:%.*]] = ashr exact <2 x i32> [[A:%.*]], ; CHECK-NEXT: ret <2 x i32> [[Z]] ; %y = ashr exact <2 x i32> %a, @@ -735,7 +735,7 @@ define <2 x i32> @test46_splat_vec(<2 x i32> %a) { define i8 @test47(i8 %a) { ; CHECK-LABEL: @test47( -; CHECK-NEXT: [[Z:%.*]] = lshr exact i8 %a, 2 +; CHECK-NEXT: [[Z:%.*]] = lshr exact i8 [[A:%.*]], 2 ; CHECK-NEXT: ret i8 [[Z]] ; %y = lshr exact i8 %a, 3 @@ -747,7 +747,7 @@ define i8 @test47(i8 %a) { define <2 x i8> @test47_splat_vec(<2 x i8> %a) { ; CHECK-LABEL: @test47_splat_vec( -; CHECK-NEXT: [[Z:%.*]] = lshr exact <2 x i8> %a, +; CHECK-NEXT: [[Z:%.*]] = lshr exact <2 x i8> [[A:%.*]], ; CHECK-NEXT: ret <2 x i8> [[Z]] ; %y = lshr exact <2 x i8> %a, @@ -759,7 +759,7 @@ define <2 x i8> @test47_splat_vec(<2 x i8> %a) { define i32 @test48(i32 %x) { ; CHECK-LABEL: @test48( -; CHECK-NEXT: [[B:%.*]] = shl i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = shl i32 [[X:%.*]], 2 ; CHECK-NEXT: ret i32 [[B]] ; %A = lshr exact i32 %x, 1 @@ -771,7 +771,7 @@ define i32 @test48(i32 %x) { define i32 @test48_nuw_nsw(i32 %x) { ; CHECK-LABEL: @test48_nuw_nsw( -; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i32 [[X:%.*]], 2 ; CHECK-NEXT: ret i32 [[B]] ; %A = lshr exact i32 %x, 1 @@ -783,7 +783,7 @@ 
define i32 @test48_nuw_nsw(i32 %x) { define <2 x i32> @test48_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test48_splat_vec( -; CHECK-NEXT: [[B:%.*]] = shl nuw nsw <2 x i32> %x, +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw <2 x i32> [[X:%.*]], ; CHECK-NEXT: ret <2 x i32> [[B]] ; %A = lshr exact <2 x i32> %x, @@ -795,7 +795,7 @@ define <2 x i32> @test48_splat_vec(<2 x i32> %x) { define i32 @test49(i32 %x) { ; CHECK-LABEL: @test49( -; CHECK-NEXT: [[B:%.*]] = shl i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = shl i32 [[X:%.*]], 2 ; CHECK-NEXT: ret i32 [[B]] ; %A = ashr exact i32 %x, 1 @@ -807,7 +807,7 @@ define i32 @test49(i32 %x) { define i32 @test49_nuw_nsw(i32 %x) { ; CHECK-LABEL: @test49_nuw_nsw( -; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i32 [[X:%.*]], 2 ; CHECK-NEXT: ret i32 [[B]] ; %A = ashr exact i32 %x, 1 @@ -819,7 +819,7 @@ define i32 @test49_nuw_nsw(i32 %x) { define <2 x i32> @test49_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test49_splat_vec( -; CHECK-NEXT: [[B:%.*]] = shl nuw nsw <2 x i32> %x, +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw <2 x i32> [[X:%.*]], ; CHECK-NEXT: ret <2 x i32> [[B]] ; %A = ashr exact <2 x i32> %x, @@ -831,7 +831,7 @@ define <2 x i32> @test49_splat_vec(<2 x i32> %x) { define i32 @test50(i32 %x) { ; CHECK-LABEL: @test50( -; CHECK-NEXT: [[B:%.*]] = ashr i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = ashr i32 [[X:%.*]], 2 ; CHECK-NEXT: ret i32 [[B]] ; %A = shl nsw i32 %x, 1 @@ -844,7 +844,7 @@ define i32 @test50(i32 %x) { define <2 x i32> @test50_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test50_splat_vec( -; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i32> %x, +; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i32> [[X:%.*]], ; CHECK-NEXT: ret <2 x i32> [[B]] ; %A = shl nsw <2 x i32> %x, @@ -856,7 +856,7 @@ define <2 x i32> @test50_splat_vec(<2 x i32> %x) { define i32 @test51(i32 %x) { ; CHECK-LABEL: @test51( -; CHECK-NEXT: [[B:%.*]] = lshr i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = lshr i32 [[X:%.*]], 2 ; CHECK-NEXT: ret i32 [[B]] ; %A = shl nuw i32 %x, 1 @@ -869,7 +869,7 @@ define i32 @test51(i32 %x) { define <2 x i32> @test51_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test51_splat_vec( -; CHECK-NEXT: [[B:%.*]] = lshr exact <2 x i32> %x, +; CHECK-NEXT: [[B:%.*]] = lshr exact <2 x i32> [[X:%.*]], ; CHECK-NEXT: ret <2 x i32> [[B]] ; %A = shl nuw <2 x i32> %x, @@ -882,7 +882,7 @@ define <2 x i32> @test51_splat_vec(<2 x i32> %x) { define i32 @test51_no_nuw(i32 %x) { ; CHECK-LABEL: @test51_no_nuw( -; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 %x, 2 +; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i32 [[X:%.*]], 2 ; CHECK-NEXT: [[B:%.*]] = and i32 [[TMP1]], 536870911 ; CHECK-NEXT: ret i32 [[B]] ; @@ -895,7 +895,7 @@ define i32 @test51_no_nuw(i32 %x) { define <2 x i32> @test51_no_nuw_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test51_no_nuw_splat_vec( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> %x, +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[B:%.*]] = and <2 x i32> [[TMP1]], ; CHECK-NEXT: ret <2 x i32> [[B]] ; @@ -908,7 +908,7 @@ define <2 x i32> @test51_no_nuw_splat_vec(<2 x i32> %x) { define i32 @test52(i32 %x) { ; CHECK-LABEL: @test52( -; CHECK-NEXT: [[B:%.*]] = shl nsw i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = shl nsw i32 [[X:%.*]], 2 ; CHECK-NEXT: ret i32 [[B]] ; %A = shl nsw i32 %x, 3 @@ -920,7 +920,7 @@ define i32 @test52(i32 %x) { define <2 x i32> @test52_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test52_splat_vec( -; CHECK-NEXT: [[B:%.*]] = shl nsw <2 x i32> %x, +; CHECK-NEXT: [[B:%.*]] = shl nsw <2 x i32> [[X:%.*]], ; CHECK-NEXT: ret <2 x i32> [[B]] ; 
%A = shl nsw <2 x i32> %x, @@ -932,7 +932,7 @@ define <2 x i32> @test52_splat_vec(<2 x i32> %x) { define i32 @test53(i32 %x) { ; CHECK-LABEL: @test53( -; CHECK-NEXT: [[B:%.*]] = shl nuw i32 %x, 2 +; CHECK-NEXT: [[B:%.*]] = shl nuw i32 [[X:%.*]], 2 ; CHECK-NEXT: ret i32 [[B]] ; %A = shl nuw i32 %x, 3 @@ -944,7 +944,7 @@ define i32 @test53(i32 %x) { define <2 x i32> @test53_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test53_splat_vec( -; CHECK-NEXT: [[B:%.*]] = shl nuw <2 x i32> %x, +; CHECK-NEXT: [[B:%.*]] = shl nuw <2 x i32> [[X:%.*]], ; CHECK-NEXT: ret <2 x i32> [[B]] ; %A = shl nuw <2 x i32> %x, @@ -956,7 +956,7 @@ define <2 x i32> @test53_splat_vec(<2 x i32> %x) { define i8 @test53_no_nuw(i8 %x) { ; CHECK-LABEL: @test53_no_nuw( -; CHECK-NEXT: [[TMP1:%.*]] = shl i8 %x, 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 2 ; CHECK-NEXT: [[B:%.*]] = and i8 [[TMP1]], 124 ; CHECK-NEXT: ret i8 [[B]] ; @@ -969,7 +969,7 @@ define i8 @test53_no_nuw(i8 %x) { define <2 x i8> @test53_no_nuw_splat_vec(<2 x i8> %x) { ; CHECK-LABEL: @test53_no_nuw_splat_vec( -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i8> %x, +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[B:%.*]] = and <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i8> [[B]] ; @@ -980,7 +980,7 @@ define <2 x i8> @test53_no_nuw_splat_vec(<2 x i8> %x) { define i32 @test54(i32 %x) { ; CHECK-LABEL: @test54( -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 %x, 3 +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 3 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP1]], 16 ; CHECK-NEXT: ret i32 [[AND]] ; @@ -992,7 +992,7 @@ define i32 @test54(i32 %x) { define <2 x i32> @test54_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test54_splat_vec( -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> %x, +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[TMP1]], ; CHECK-NEXT: ret <2 x i32> [[AND]] ; @@ -1004,7 +1004,7 @@ define <2 x i32> @test54_splat_vec(<2 x i32> %x) { define i32 @test55(i32 %x) { ; CHECK-LABEL: @test55( -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 %x, 3 +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X:%.*]], 3 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[TMP1]], 8 ; CHECK-NEXT: ret i32 [[OR]] ; @@ -1016,7 +1016,7 @@ define i32 @test55(i32 %x) { define i32 @test56(i32 %x) { ; CHECK-LABEL: @test56( -; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 %x, 1 +; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 [[X:%.*]], 1 ; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[SHR2]], 4 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL]], 7 ; CHECK-NEXT: ret i32 [[OR]] @@ -1029,8 +1029,8 @@ define i32 @test56(i32 %x) { define i32 @test57(i32 %x) { ; CHECK-LABEL: @test57( -; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 %x, 1 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[SHR1]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X:%.*]], 1 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 4 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL]], 7 ; CHECK-NEXT: ret i32 [[OR]] ; @@ -1042,7 +1042,7 @@ define i32 @test57(i32 %x) { define i32 @test58(i32 %x) { ; CHECK-LABEL: @test58( -; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 %x, 3 +; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 3 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[TMP1]], 1 ; CHECK-NEXT: ret i32 [[OR]] ; @@ -1054,7 +1054,7 @@ define i32 @test58(i32 %x) { define <2 x i32> @test58_splat_vec(<2 x i32> %x) { ; CHECK-LABEL: @test58_splat_vec( -; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> %x, +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[TMP1]], ; CHECK-NEXT: ret <2 x i32> [[OR]] ; @@ -1066,7 +1066,7 @@ define <2 x i32> @test58_splat_vec(<2 x i32> %x) { 
define i32 @test59(i32 %x) { ; CHECK-LABEL: @test59( -; CHECK-NEXT: [[SHR:%.*]] = ashr i32 %x, 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[X:%.*]], 4 ; CHECK-NEXT: [[SHL:%.*]] = shl nsw i32 [[SHR]], 1 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL]], 2 ; CHECK-NEXT: ret i32 [[OR]] @@ -1080,7 +1080,7 @@ define i32 @test59(i32 %x) { ; propagate "exact" trait define i32 @test60(i32 %x) { ; CHECK-LABEL: @test60( -; CHECK-NEXT: [[SHL:%.*]] = ashr exact i32 %x, 3 +; CHECK-NEXT: [[SHL:%.*]] = ashr exact i32 [[X:%.*]], 3 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL]], 1 ; CHECK-NEXT: ret i32 [[OR]] ; @@ -1094,19 +1094,19 @@ define i32 @test60(i32 %x) { define void @test61(i128 %arg) { ; CHECK-LABEL: @test61( ; CHECK-NEXT: bb: -; CHECK-NEXT: br i1 undef, label %bb1, label %bb12 +; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB12:%.*]] ; CHECK: bb1: -; CHECK-NEXT: br label %bb2 +; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: br i1 undef, label %bb3, label %bb7 +; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB7:%.*]] ; CHECK: bb3: -; CHECK-NEXT: br label %bb8 +; CHECK-NEXT: br label [[BB8:%.*]] ; CHECK: bb7: -; CHECK-NEXT: br i1 undef, label %bb8, label %bb2 +; CHECK-NEXT: br i1 undef, label [[BB8]], label [[BB2]] ; CHECK: bb8: -; CHECK-NEXT: br i1 undef, label %bb11, label %bb12 +; CHECK-NEXT: br i1 undef, label [[BB11:%.*]], label [[BB12]] ; CHECK: bb11: -; CHECK-NEXT: br i1 undef, label %bb1, label %bb12 +; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB12]] ; CHECK: bb12: ; CHECK-NEXT: ret void ; @@ -1159,7 +1159,7 @@ define <4 x i32> @test62_splat_vector(<4 x i32> %a) { define <4 x i32> @test62_non_splat_vector(<4 x i32> %a) { ; CHECK-LABEL: @test62_non_splat_vector( -; CHECK-NEXT: [[B:%.*]] = ashr <4 x i32> %a, +; CHECK-NEXT: [[B:%.*]] = ashr <4 x i32> [[A:%.*]], ; CHECK-NEXT: ret <4 x i32> [[B]] ; %b = ashr <4 x i32> %a, ; shift all bits out @@ -1168,7 +1168,7 @@ define <4 x i32> @test62_non_splat_vector(<4 x i32> %a) { define <2 x i65> @test_63(<2 x i64> %t) { ; CHECK-LABEL: @test_63( -; CHECK-NEXT: [[A:%.*]] = zext <2 x i64> %t to <2 x i65> +; CHECK-NEXT: [[A:%.*]] = zext <2 x i64> [[T:%.*]] to <2 x i65> ; CHECK-NEXT: [[SEXT:%.*]] = shl <2 x i65> [[A]], ; CHECK-NEXT: [[B:%.*]] = ashr exact <2 x i65> [[SEXT]], ; CHECK-NEXT: ret <2 x i65> [[B]] @@ -1294,7 +1294,7 @@ define i64 @shl_zext_mul_extra_use2(i32 %t) { define <2 x i8> @ashr_demanded_bits_splat(<2 x i8> %x) { ; CHECK-LABEL: @ashr_demanded_bits_splat( -; CHECK-NEXT: [[SHR:%.*]] = ashr <2 x i8> %x, +; CHECK-NEXT: [[SHR:%.*]] = ashr <2 x i8> [[X:%.*]], ; CHECK-NEXT: ret <2 x i8> [[SHR]] ; %and = and <2 x i8> %x, @@ -1304,7 +1304,7 @@ define <2 x i8> @ashr_demanded_bits_splat(<2 x i8> %x) { define <2 x i8> @lshr_demanded_bits_splat(<2 x i8> %x) { ; CHECK-LABEL: @lshr_demanded_bits_splat( -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i8> %x, +; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i8> [[X:%.*]], ; CHECK-NEXT: ret <2 x i8> [[SHR]] ; %and = and <2 x i8> %x, diff --git a/llvm/test/Transforms/InstCombine/sign-bit-test-via-right-shifting-all-other-bits.ll b/llvm/test/Transforms/InstCombine/sign-bit-test-via-right-shifting-all-other-bits.ll new file mode 100644 index 00000000000000..c6507afab1f38b --- /dev/null +++ b/llvm/test/Transforms/InstCombine/sign-bit-test-via-right-shifting-all-other-bits.ll @@ -0,0 +1,168 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +define i1 @highest_bit_test_via_lshr(i32 %data, i32 %nbits) { +; CHECK-LABEL: 
@highest_bit_test_via_lshr( +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i32 [[DATA:%.*]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %num_low_bits_to_skip = sub i32 32, %nbits + %high_bits_extracted = lshr i32 %data, %num_low_bits_to_skip + %skip_all_bits_till_signbit = sub i32 %nbits, 1 + %signbit = lshr i32 %high_bits_extracted, %skip_all_bits_till_signbit + %isneg = icmp ne i32 %signbit, 0 + ret i1 %isneg +} + +define i1 @highest_bit_test_via_lshr_with_truncation(i64 %data, i32 %nbits) { +; CHECK-LABEL: @highest_bit_test_via_lshr_with_truncation( +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i64 [[DATA:%.*]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %num_low_bits_to_skip = sub i32 64, %nbits + %num_low_bits_to_skip_wide = zext i32 %num_low_bits_to_skip to i64 + %high_bits_extracted = lshr i64 %data, %num_low_bits_to_skip_wide + %high_bits_extracted_narrow = trunc i64 %high_bits_extracted to i32 + %skip_all_bits_till_signbit = sub i32 %nbits, 1 + %signbit = lshr i32 %high_bits_extracted_narrow, %skip_all_bits_till_signbit + %isneg = icmp ne i32 %signbit, 0 + ret i1 %isneg +} + +define i1 @highest_bit_test_via_ashr(i32 %data, i32 %nbits) { +; CHECK-LABEL: @highest_bit_test_via_ashr( +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i32 [[DATA:%.*]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %num_low_bits_to_skip = sub i32 32, %nbits + %high_bits_extracted = ashr i32 %data, %num_low_bits_to_skip + %skip_all_bits_till_signbit = sub i32 %nbits, 1 + %signbit = ashr i32 %high_bits_extracted, %skip_all_bits_till_signbit + %isneg = icmp ne i32 %signbit, 0 + ret i1 %isneg +} + +define i1 @highest_bit_test_via_ashr_with_truncation(i64 %data, i32 %nbits) { +; CHECK-LABEL: @highest_bit_test_via_ashr_with_truncation( +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i64 [[DATA:%.*]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %num_low_bits_to_skip = sub i32 64, %nbits + %num_low_bits_to_skip_wide = zext i32 %num_low_bits_to_skip to i64 + %high_bits_extracted = ashr i64 %data, %num_low_bits_to_skip_wide + %high_bits_extracted_narrow = trunc i64 %high_bits_extracted to i32 + %skip_all_bits_till_signbit = sub i32 %nbits, 1 + %signbit = ashr i32 %high_bits_extracted_narrow, %skip_all_bits_till_signbit + %isneg = icmp ne i32 %signbit, 0 + ret i1 %isneg +} + +declare void @use32(i32) +declare void @use64(i64) + +define i1 @unsigned_sign_bit_extract(i32 %x) { +; CHECK-LABEL: @unsigned_sign_bit_extract( +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i32 [[X:%.*]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signbit = lshr i32 %x, 31 + %isneg = icmp ne i32 %signbit, 0 + ret i1 %isneg +} +define i1 @unsigned_sign_bit_extract_extrause(i32 %x) { +; CHECK-LABEL: @unsigned_sign_bit_extract_extrause( +; CHECK-NEXT: [[SIGNBIT:%.*]] = lshr i32 [[X:%.*]], 31 +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT]]) +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i32 [[X]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signbit = lshr i32 %x, 31 + call void @use32(i32 %signbit) + %isneg = icmp ne i32 %signbit, 0 + ret i1 %isneg +} +define i1 @unsigned_sign_bit_extract_extrause__ispositive(i32 %x) { +; CHECK-LABEL: @unsigned_sign_bit_extract_extrause__ispositive( +; CHECK-NEXT: [[SIGNBIT:%.*]] = lshr i32 [[X:%.*]], 31 +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT]]) +; CHECK-NEXT: [[ISNEG:%.*]] = icmp sgt i32 [[X]], -1 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signbit = lshr i32 %x, 31 + call void @use32(i32 %signbit) + %isneg = icmp eq i32 %signbit, 0 + ret i1 %isneg +} +define i1 @signed_sign_bit_extract(i32 %x) { +; CHECK-LABEL: @signed_sign_bit_extract( +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i32 
[[X:%.*]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signsmear = ashr i32 %x, 31 + %isneg = icmp ne i32 %signsmear, 0 + ret i1 %isneg +} +define i1 @signed_sign_bit_extract_extrause(i32 %x) { +; CHECK-LABEL: @signed_sign_bit_extract_extrause( +; CHECK-NEXT: [[SIGNSMEAR:%.*]] = ashr i32 [[X:%.*]], 31 +; CHECK-NEXT: call void @use32(i32 [[SIGNSMEAR]]) +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i32 [[X]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signsmear = ashr i32 %x, 31 + call void @use32(i32 %signsmear) + %isneg = icmp ne i32 %signsmear, 0 + ret i1 %isneg +} +define i1 @unsigned_sign_bit_extract_with_trunc(i64 %x) { +; CHECK-LABEL: @unsigned_sign_bit_extract_with_trunc( +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i64 [[X:%.*]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signbit = lshr i64 %x, 63 + %signbit_narrow = trunc i64 %signbit to i32 + %isneg = icmp ne i32 %signbit_narrow, 0 + ret i1 %isneg +} +define i1 @unsigned_sign_bit_extract_with_trunc_extrause(i64 %x) { +; CHECK-LABEL: @unsigned_sign_bit_extract_with_trunc_extrause( +; CHECK-NEXT: [[SIGNBIT:%.*]] = lshr i64 [[X:%.*]], 63 +; CHECK-NEXT: call void @use64(i64 [[SIGNBIT]]) +; CHECK-NEXT: [[SIGNBIT_NARROW:%.*]] = trunc i64 [[SIGNBIT]] to i32 +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_NARROW]]) +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i64 [[X]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signbit = lshr i64 %x, 63 + call void @use64(i64 %signbit) + %signbit_narrow = trunc i64 %signbit to i32 + call void @use32(i32 %signbit_narrow) + %isneg = icmp ne i32 %signbit_narrow, 0 + ret i1 %isneg +} +define i1 @signed_sign_bit_extract_trunc(i64 %x) { +; CHECK-LABEL: @signed_sign_bit_extract_trunc( +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i64 [[X:%.*]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signsmear = ashr i64 %x, 63 + %signsmear_narrow = trunc i64 %signsmear to i32 + %isneg = icmp ne i32 %signsmear_narrow, 0 + ret i1 %isneg +} +define i1 @signed_sign_bit_extract_trunc_extrause(i64 %x) { +; CHECK-LABEL: @signed_sign_bit_extract_trunc_extrause( +; CHECK-NEXT: [[SIGNSMEAR:%.*]] = ashr i64 [[X:%.*]], 63 +; CHECK-NEXT: call void @use64(i64 [[SIGNSMEAR]]) +; CHECK-NEXT: [[SIGNSMEAR_NARROW:%.*]] = trunc i64 [[SIGNSMEAR]] to i32 +; CHECK-NEXT: call void @use32(i32 [[SIGNSMEAR_NARROW]]) +; CHECK-NEXT: [[ISNEG:%.*]] = icmp slt i64 [[X]], 0 +; CHECK-NEXT: ret i1 [[ISNEG]] +; + %signsmear = ashr i64 %x, 63 + call void @use64(i64 %signsmear) + %signsmear_narrow = trunc i64 %signsmear to i32 + call void @use32(i32 %signsmear_narrow) + %isneg = icmp ne i32 %signsmear_narrow, 0 + ret i1 %isneg +} diff --git a/llvm/test/Transforms/InstCombine/sub-of-negatible.ll b/llvm/test/Transforms/InstCombine/sub-of-negatible.ll new file mode 100644 index 00000000000000..2d9910352683ba --- /dev/null +++ b/llvm/test/Transforms/InstCombine/sub-of-negatible.ll @@ -0,0 +1,292 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -instcombine -S | FileCheck %s + +declare void @use8(i8) + +; Constant can be freely negated. 
+define i8 @t0(i8 %x) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: [[T0:%.*]] = add i8 [[X:%.*]], 42 +; CHECK-NEXT: ret i8 [[T0]] +; + %t0 = sub i8 %x, -42 + ret i8 %t0 +} + +; Negation can be negated for free +define i8 @t1(i8 %x, i8 %y) { +; CHECK-LABEL: @t1( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = add i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} + +; Shift-left can be negated if all uses can be updated +define i8 @t2(i8 %x, i8 %y) { +; CHECK-LABEL: @t2( +; CHECK-NEXT: [[T0:%.*]] = shl i8 -42, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = shl i8 -42, %y + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n2(i8 %x, i8 %y) { +; CHECK-LABEL: @n2( +; CHECK-NEXT: [[T0:%.*]] = shl i8 -42, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = shl i8 -42, %y + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @t3(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t3( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = shl i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = shl i8 %t0, %y + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n3(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n3( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = shl i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = shl i8 %t0, %y + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Select can be negated if all it's operands can be negated and all the users of select can be updated +define i8 @t4(i8 %x, i1 %y) { +; CHECK-LABEL: @t4( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 44 +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 44 + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n4(i8 %x, i1 %y) { +; CHECK-LABEL: @n4( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 44 +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 44 + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n5(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @n5( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 [[Z:%.*]] +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 %z + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @t6(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @t6( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = select i1 %y, i8 -42, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @t7(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @t7( +; CHECK-NEXT: [[T0:%.*]] = shl i8 1, [[Z:%.*]] +; 
CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 0, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = shl i8 1, %z + %t1 = select i1 %y, i8 0, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n8(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @n8( +; CHECK-NEXT: [[T0:%.*]] = shl i8 1, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 0, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = shl i8 1, %z + call void @use8(i8 %t0) + %t1 = select i1 %y, i8 0, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Subtraction can be negated if the first operand can be negated +; x - (y - z) -> x - y + z -> x + (-y) + z +define i8 @t9(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t9( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T11:%.*]] = add i8 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[T11]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %t0, %y + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n10(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n10( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %t0, %y + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n11(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n11( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = add i8 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %y, %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Addition can be negated if both operands can be negated +; x - (y + z) -> x - y - z -> x + ((-y) + (-z))) +define i8 @t12(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t12( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T3]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 0, %z + call void @use8(i8 %t1) + %t2 = add i8 %t0, %t1 + %t3 = sub i8 %x, %t2 + ret i8 %t3 +} +define i8 @n13(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n13( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T11:%.*]] = sub i8 [[Y]], [[Z:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[T11]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = add i8 %t0, %z + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n14(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n14( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 0, [[TMP1]] +; CHECK-NEXT: call void @use8(i8 [[T2]]) +; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T3]] +; + %t0 = 
sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 0, %z + call void @use8(i8 %t1) + %t2 = add i8 %t0, %t1 + call void @use8(i8 %t2) + %t3 = sub i8 %x, %t2 + ret i8 %t3 +} + +; Multiplication can be negated if either one of operands can be negated +; x - (y * z) -> x + ((-y) * z) or x + ((-z) * y) +define i8 @t15(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t15( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[TMP1:%.*]] = mul i8 [[Z:%.*]], [[Y]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = mul i8 %t0, %z + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n16(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n16( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = mul i8 [[T0]], [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = mul i8 %t0, %z + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} diff --git a/llvm/test/Transforms/InstCombine/unpack-fca.ll b/llvm/test/Transforms/InstCombine/unpack-fca.ll index 3c5e4177d69f95..1bfd53f40322a5 100644 --- a/llvm/test/Transforms/InstCombine/unpack-fca.ll +++ b/llvm/test/Transforms/InstCombine/unpack-fca.ll @@ -13,7 +13,7 @@ declare i32 @A.foo(%A* nocapture %this) define void @storeA(%A* %a.ptr) { ; CHECK-LABEL: storeA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds %A, %A* %a.ptr, i64 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr %A, %A* %a.ptr, i64 0, i32 0 ; CHECK-NEXT: store %A__vtbl* @A__vtblZ, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: ret void store %A { %A__vtbl* @A__vtblZ }, %A* %a.ptr, align 8 @@ -33,7 +33,7 @@ define void @storeB(%B* %b.ptr) { define void @storeStructOfA({ %A }* %sa.ptr) { ; CHECK-LABEL: storeStructOfA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds { %A }, { %A }* %sa.ptr, i64 0, i32 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr { %A }, { %A }* %sa.ptr, i64 0, i32 0, i32 0 ; CHECK-NEXT: store %A__vtbl* @A__vtblZ, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: ret void store { %A } { %A { %A__vtbl* @A__vtblZ } }, { %A }* %sa.ptr, align 8 @@ -42,7 +42,7 @@ define void @storeStructOfA({ %A }* %sa.ptr) { define void @storeArrayOfA([1 x %A]* %aa.ptr) { ; CHECK-LABEL: storeArrayOfA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds [1 x %A], [1 x %A]* %aa.ptr, i64 0, i64 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr [1 x %A], [1 x %A]* %aa.ptr, i64 0, i64 0, i32 0 ; CHECK-NEXT: store %A__vtbl* @A__vtblZ, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: ret void store [1 x %A] [%A { %A__vtbl* @A__vtblZ }], [1 x %A]* %aa.ptr, align 8 @@ -60,7 +60,7 @@ define void @storeLargeArrayOfA([2000 x %A]* %aa.ptr) { define void @storeStructOfArrayOfA({ [1 x %A] }* %saa.ptr) { ; CHECK-LABEL: storeStructOfArrayOfA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds { [1 x %A] }, { [1 x %A] }* %saa.ptr, i64 0, i32 0, i64 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr { [1 x %A] }, { [1 x %A] }* %saa.ptr, i64 0, i32 0, i64 0, i32 0 ; CHECK-NEXT: store %A__vtbl* @A__vtblZ, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: ret void store { [1 x %A] } { [1 x %A] [%A { %A__vtbl* @A__vtblZ }] }, { [1 x %A] }* %saa.ptr, align 8 @@ -90,7 +90,7 @@ define void @storeArrayOfB([2 x %B]* %ab.ptr, [2 x %B] %ab) { 
define %A @loadA(%A* %a.ptr) { ; CHECK-LABEL: loadA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds %A, %A* %a.ptr, i64 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr %A, %A* %a.ptr, i64 0, i32 0 ; CHECK-NEXT: [[LOAD:%[a-z0-9\.]+]] = load %A__vtbl*, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: [[IV:%[a-z0-9\.]+]] = insertvalue %A undef, %A__vtbl* [[LOAD]], 0 ; CHECK-NEXT: ret %A [[IV]] @@ -113,7 +113,7 @@ define %B @loadB(%B* %b.ptr) { define { %A } @loadStructOfA({ %A }* %sa.ptr) { ; CHECK-LABEL: loadStructOfA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds { %A }, { %A }* %sa.ptr, i64 0, i32 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr { %A }, { %A }* %sa.ptr, i64 0, i32 0, i32 0 ; CHECK-NEXT: [[LOAD:%[a-z0-9\.]+]] = load %A__vtbl*, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: [[IV1:%[a-z0-9\.]+]] = insertvalue %A undef, %A__vtbl* [[LOAD]], 0 ; CHECK-NEXT: [[IV2:%[a-z0-9\.]+]] = insertvalue { %A } undef, %A [[IV1]], 0 @@ -124,7 +124,7 @@ define { %A } @loadStructOfA({ %A }* %sa.ptr) { define [1 x %A] @loadArrayOfA([1 x %A]* %aa.ptr) { ; CHECK-LABEL: loadArrayOfA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds [1 x %A], [1 x %A]* %aa.ptr, i64 0, i64 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr [1 x %A], [1 x %A]* %aa.ptr, i64 0, i64 0, i32 0 ; CHECK-NEXT: [[LOAD:%[a-z0-9\.]+]] = load %A__vtbl*, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: [[IV1:%[a-z0-9\.]+]] = insertvalue %A undef, %A__vtbl* [[LOAD]], 0 ; CHECK-NEXT: [[IV2:%[a-z0-9\.]+]] = insertvalue [1 x %A] undef, %A [[IV1]], 0 @@ -135,7 +135,7 @@ define [1 x %A] @loadArrayOfA([1 x %A]* %aa.ptr) { define { [1 x %A] } @loadStructOfArrayOfA({ [1 x %A] }* %saa.ptr) { ; CHECK-LABEL: loadStructOfArrayOfA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds { [1 x %A] }, { [1 x %A] }* %saa.ptr, i64 0, i32 0, i64 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr { [1 x %A] }, { [1 x %A] }* %saa.ptr, i64 0, i32 0, i64 0, i32 0 ; CHECK-NEXT: [[LOAD:%[a-z0-9\.]+]] = load %A__vtbl*, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: [[IV1:%[a-z0-9\.]+]] = insertvalue %A undef, %A__vtbl* [[LOAD]], 0 ; CHECK-NEXT: [[IV2:%[a-z0-9\.]+]] = insertvalue [1 x %A] undef, %A [[IV1]], 0 @@ -147,7 +147,7 @@ define { [1 x %A] } @loadStructOfArrayOfA({ [1 x %A] }* %saa.ptr) { define { %A } @structOfA({ %A }* %sa.ptr) { ; CHECK-LABEL: structOfA -; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds { %A }, { %A }* %sa.ptr, i64 0, i32 0, i32 0 +; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr { %A }, { %A }* %sa.ptr, i64 0, i32 0, i32 0 ; CHECK-NEXT: store %A__vtbl* @A__vtblZ, %A__vtbl** [[GEP]], align 8 ; CHECK-NEXT: ret { %A } { %A { %A__vtbl* @A__vtblZ } } store { %A } { %A { %A__vtbl* @A__vtblZ } }, { %A }* %sa.ptr, align 8 diff --git a/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll b/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll new file mode 100644 index 00000000000000..a5f38735a37382 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll @@ -0,0 +1,575 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -instcombine -S | FileCheck %s + +declare void @use16(i16) +declare void @use32(i32) +declare void @use64(i64) + +define i32 @t0(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; 
CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t0_zext_of_nbits(i64 %data, i8 %nbits_narrow) { +; CHECK-LABEL: @t0_zext_of_nbits( +; CHECK-NEXT: [[NBITS:%.*]] = zext i8 [[NBITS_NARROW:%.*]] to i16 +; CHECK-NEXT: call void @use16(i16 [[NBITS]]) +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub nsw i16 64, [[NBITS]] +; CHECK-NEXT: call void @use16(i16 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i16 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW:%.*]] = sub nsw i16 32, [[NBITS]] +; CHECK-NEXT: call void @use16(i16 [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = zext i16 [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW]] to i32 +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %nbits = zext i8 %nbits_narrow to i16 + call void @use16(i16 %nbits) + %skip_high = sub i16 64, %nbits + call void @use16(i16 %skip_high) + %skip_high_wide = zext i16 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow_narrow = sub i16 32, %nbits + call void @use16(i16 %num_high_bits_to_smear_narrow_narrow) + %num_high_bits_to_smear_narrow = zext i16 %num_high_bits_to_smear_narrow_narrow to i32 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t0_exact(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t0_exact( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] 
+; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr exact i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr exact i64 %data, %skip_high_wide ; We can preserve `exact`-ness of the original shift. + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @t1_redundant_sext(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t1_redundant_sext( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED_WITH_SIGNEXTENSION:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED_WITH_SIGNEXTENSION]]) +; CHECK-NEXT: [[EXTRACTED_WITH_SIGNEXTENSION_NARROW:%.*]] = trunc i64 [[EXTRACTED_WITH_SIGNEXTENSION]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: ret i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted_with_signextension = ashr i64 %data, %skip_high_wide + call void @use64(i64 %extracted_with_signextension) + %extracted_with_signextension_narrow = trunc i64 %extracted_with_signextension to i32 ; this is already the answer. 
+ call void @use32(i32 %extracted_with_signextension_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_with_signextension_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i64 @t2_notrunc(i64 %data, i64 %nbits) { +; CHECK-LABEL: @t2_notrunc( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i64 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR:%.*]] = sub i64 64, [[NBITS]] +; CHECK-NEXT: call void @use64(i64 [[NUM_HIGH_BITS_TO_SMEAR]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i64 [[EXTRACTED]], [[NUM_HIGH_BITS_TO_SMEAR]] +; CHECK-NEXT: call void @use64(i64 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH]] +; CHECK-NEXT: ret i64 [[SIGNEXTENDED]] +; + %skip_high = sub i64 64, %nbits + call void @use64(i64 %skip_high) + %extracted = lshr i64 %data, %skip_high + call void @use64(i64 %extracted) + %num_high_bits_to_smear = sub i64 64, %nbits + call void @use64(i64 %num_high_bits_to_smear) + %signbit_positioned = shl i64 %extracted, %num_high_bits_to_smear ; + call void @use64(i64 %signbit_positioned) + %signextended = ashr i64 %signbit_positioned, %num_high_bits_to_smear ; can just shift %data itself. + ret i64 %signextended +} + +define i64 @t3_notrunc_redundant_sext(i64 %data, i64 %nbits) { +; CHECK-LABEL: @t3_notrunc_redundant_sext( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i64 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR:%.*]] = sub i64 64, [[NBITS]] +; CHECK-NEXT: call void @use64(i64 [[NUM_HIGH_BITS_TO_SMEAR]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i64 [[EXTRACTED]], [[NUM_HIGH_BITS_TO_SMEAR]] +; CHECK-NEXT: call void @use64(i64 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: ret i64 [[EXTRACTED]] +; + %skip_high = sub i64 64, %nbits + call void @use64(i64 %skip_high) + %extracted = ashr i64 %data, %skip_high ; this is already the answer. 
+ call void @use64(i64 %extracted) + %num_high_bits_to_smear = sub i64 64, %nbits + call void @use64(i64 %num_high_bits_to_smear) + %signbit_positioned = shl i64 %extracted, %num_high_bits_to_smear + call void @use64(i64 %signbit_positioned) + %signextended = ashr i64 %signbit_positioned, %num_high_bits_to_smear + ret i64 %signextended +} + +define <2 x i32> @t4_vec(<2 x i64> %data, <2 x i32> %nbits) { +; CHECK-LABEL: @t4_vec( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <2 x i32> , [[NBITS:%.*]] +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext <2 x i32> [[SKIP_HIGH]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[SIGNEXTENDED]] +; + %skip_high = sub <2 x i32> , %nbits + %skip_high_wide = zext <2 x i32> %skip_high to <2 x i64> + %extracted = lshr <2 x i64> %data, %skip_high_wide + %extracted_narrow = trunc <2 x i64> %extracted to <2 x i32> + %num_high_bits_to_smear_narrow = sub <2 x i32> , %nbits + %signbit_positioned = shl <2 x i32> %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr <2 x i32> %signbit_positioned, %num_high_bits_to_smear_narrow + ret <2 x i32> %signextended +} + +define <3 x i32> @t5_vec_undef(<3 x i64> %data, <3 x i32> %nbits) { +; CHECK-LABEL: @t5_vec_undef( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <3 x i32> , [[NBITS:%.*]] +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext <3 x i32> [[SKIP_HIGH]] to <3 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc <3 x i64> [[TMP1]] to <3 x i32> +; CHECK-NEXT: ret <3 x i32> [[SIGNEXTENDED]] +; + %skip_high = sub <3 x i32> , %nbits + %skip_high_wide = zext <3 x i32> %skip_high to <3 x i64> + %extracted = lshr <3 x i64> %data, %skip_high_wide + %extracted_narrow = trunc <3 x i64> %extracted to <3 x i32> + %num_high_bits_to_smear_narrow0 = sub <3 x i32> , %nbits + %num_high_bits_to_smear_narrow1 = sub <3 x i32> , %nbits + %signbit_positioned = shl <3 x i32> %extracted_narrow, %num_high_bits_to_smear_narrow0 + %signextended = ashr <3 x i32> %signbit_positioned, %num_high_bits_to_smear_narrow1 + ret <3 x i32> %signextended +} + +; Extra-uses +define i32 @t6_extrause_good0(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t6_extrause_good0( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, 
%nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow ; will go away + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t7_extrause_good1(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t7_extrause_good1( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW0:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow0 = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow0) + %num_high_bits_to_smear_narrow1 = sub i32 32, %nbits ; will go away. 
+ %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow0 + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow1 + ret i32 %signextended +} +define i32 @n8_extrause_bad(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n8_extrause_bad( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow ; neither of operands will go away. 
+ ret i32 %signextended +} + +; Negative tests +define i32 @n9(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n9( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 63, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 63, %nbits ; not 64 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n10(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n10( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 31, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 31, %nbits ; not 32 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n11(i64 %data, i32 %nbits1, i32 %nbits2) { +; CHECK-LABEL: @n11( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS1:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 
[[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS2:%.*]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits1 ; not %nbits2 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits2 ; not %nbits1 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n12(i64 %data, i32 %nbits1, i32 %nbits2) { +; CHECK-LABEL: @n12( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS1:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW1:%.*]] = sub i32 32, [[NBITS1]] +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW2:%.*]] = sub i32 32, [[NBITS2:%.*]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]]) +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW2]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW2]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits1 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow1 = sub i32 32, %nbits1 ; not %nbits2 + %num_high_bits_to_smear_narrow2 = sub i32 32, %nbits2 ; not %nbits1 + call void @use32(i32 %num_high_bits_to_smear_narrow1) + call void @use32(i32 %num_high_bits_to_smear_narrow2) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow1 + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow2 + ret i32 %signextended +} + +define i32 @n13(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n13( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 
[[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 -1, [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[TMP1]], [[EXTRACTED_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} +define i32 @n13_extrause(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n13_extrause( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[HIGHBITS_CLEANED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[HIGHBITS_CLEANED]]) +; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[HIGHBITS_CLEANED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %highbits_cleaned) + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} +define i32 @n14(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n14( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: 
call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 -1, [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[TMP1]], [[EXTRACTED_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = ashr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} +define i32 @n14_extrause(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n14_extrause( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[HIGHBITS_CLEANED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[HIGHBITS_CLEANED]]) +; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[HIGHBITS_CLEANED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = ashr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %highbits_cleaned) + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll index 3083e60cd90576..2fc0841a8e15b9 100644 --- a/llvm/test/Transforms/InstSimplify/call.ll +++ b/llvm/test/Transforms/InstSimplify/call.ll @@ -747,8 +747,7 @@ declare double @llvm.fmuladd.f64(double,double,double) define double @fma_undef_op0(double %x, double %y) { ; CHECK-LABEL: @fma_undef_op0( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fma.f64(double undef, double [[X:%.*]], double [[Y:%.*]]) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %r = call double @llvm.fma.f64(double undef, double %x, double %y) ret double %r @@ -756,8 +755,7 @@ define double @fma_undef_op0(double %x, double %y) { define double @fma_undef_op1(double %x, double %y) { ; CHECK-LABEL: @fma_undef_op1( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fma.f64(double [[X:%.*]], double undef, double [[Y:%.*]]) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %r = call double 
@llvm.fma.f64(double %x, double undef, double %y) ret double %r @@ -765,8 +763,7 @@ define double @fma_undef_op1(double %x, double %y) { define double @fma_undef_op2(double %x, double %y) { ; CHECK-LABEL: @fma_undef_op2( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double undef) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %r = call double @llvm.fma.f64(double %x, double %y, double undef) ret double %r @@ -774,8 +771,7 @@ define double @fma_undef_op2(double %x, double %y) { define double @fmuladd_undef_op0(double %x, double %y) { ; CHECK-LABEL: @fmuladd_undef_op0( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fmuladd.f64(double undef, double [[X:%.*]], double [[Y:%.*]]) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %r = call double @llvm.fmuladd.f64(double undef, double %x, double %y) ret double %r @@ -783,8 +779,7 @@ define double @fmuladd_undef_op0(double %x, double %y) { define double @fmuladd_undef_op1(double %x, double %y) { ; CHECK-LABEL: @fmuladd_undef_op1( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fmuladd.f64(double [[X:%.*]], double undef, double [[Y:%.*]]) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %r = call double @llvm.fmuladd.f64(double %x, double undef, double %y) ret double %r @@ -792,8 +787,7 @@ define double @fmuladd_undef_op1(double %x, double %y) { define double @fmuladd_undef_op2(double %x, double %y) { ; CHECK-LABEL: @fmuladd_undef_op2( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fmuladd.f64(double [[X:%.*]], double [[Y:%.*]], double undef) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %r = call double @llvm.fmuladd.f64(double %x, double %y, double undef) ret double %r @@ -801,8 +795,7 @@ define double @fmuladd_undef_op2(double %x, double %y) { define double @fma_nan_op0(double %x, double %y) { ; CHECK-LABEL: @fma_nan_op0( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fma.f64(double 0x7FF8000000000000, double [[X:%.*]], double [[Y:%.*]]) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %r = call double @llvm.fma.f64(double 0x7ff8000000000000, double %x, double %y) ret double %r @@ -810,8 +803,7 @@ define double @fma_nan_op0(double %x, double %y) { define double @fma_nan_op1(double %x, double %y) { ; CHECK-LABEL: @fma_nan_op1( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fma.f64(double [[X:%.*]], double 0x7FF8000000000001, double [[Y:%.*]]) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000001 ; %r = call double @llvm.fma.f64(double %x, double 0x7ff8000000000001, double %y) ret double %r @@ -819,8 +811,7 @@ define double @fma_nan_op1(double %x, double %y) { define double @fma_nan_op2(double %x, double %y) { ; CHECK-LABEL: @fma_nan_op2( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double 0x7FF8000000000002) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000000002 ; %r = call double @llvm.fma.f64(double %x, double %y, double 0x7ff8000000000002) ret double %r @@ -828,8 +819,7 @@ define double @fma_nan_op2(double %x, double %y) { define double @fmuladd_nan_op0_op1(double %x) { ; CHECK-LABEL: @fmuladd_nan_op0_op1( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fmuladd.f64(double 0x7FF8000000001234, double 0x7FF800000000DEAD, double [[X:%.*]]) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000001234 ; %r = call double @llvm.fmuladd.f64(double 0x7ff8000000001234, 
double 0x7ff800000000dead, double %x) ret double %r @@ -837,8 +827,7 @@ define double @fmuladd_nan_op0_op1(double %x) { define double @fmuladd_nan_op0_op2(double %x) { ; CHECK-LABEL: @fmuladd_nan_op0_op2( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fmuladd.f64(double 0x7FF8000000005678, double [[X:%.*]], double 0x7FF800000000DEAD) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF8000000005678 ; %r = call double @llvm.fmuladd.f64(double 0x7ff8000000005678, double %x, double 0x7ff800000000dead) ret double %r @@ -846,8 +835,7 @@ define double @fmuladd_nan_op0_op2(double %x) { define double @fmuladd_nan_op1_op2(double %x) { ; CHECK-LABEL: @fmuladd_nan_op1_op2( -; CHECK-NEXT: [[R:%.*]] = call double @llvm.fmuladd.f64(double [[X:%.*]], double 0x7FF80000AAAAAAAA, double 0x7FF800000000DEAD) -; CHECK-NEXT: ret double [[R]] +; CHECK-NEXT: ret double 0x7FF80000AAAAAAAA ; %r = call double @llvm.fmuladd.f64(double %x, double 0x7ff80000aaaaaaaa, double 0x7ff800000000dead) ret double %r diff --git a/llvm/test/Transforms/LICM/guards.ll b/llvm/test/Transforms/LICM/guards.ll index 2873c89d09230a..2343e0917c568c 100644 --- a/llvm/test/Transforms/LICM/guards.ll +++ b/llvm/test/Transforms/LICM/guards.ll @@ -84,7 +84,7 @@ loop: } ; But can hoist if the side effect is hoisted with MSSA -define void @test2b_prime(i1 %cond, i32* %ptr) { +define void @test2b_prime(i1 %cond, i32* noalias %ptr) { ; MSSA-LABEL: @test2b_prime( ; MSSA-NEXT: entry: ; MSSA-NEXT: [[P2:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll new file mode 100644 index 00000000000000..90f0ae4cd865c4 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +; This is a bugpoint reduction of a test from PR43582: +; https://bugs.llvm.org/show_bug.cgi?id=43582 + +; ...but it's over-simplifying the underlying question: +; TODO: Should this be vectorized rather than allowing the backend to load combine? +; The original code is a bswap pattern. 
+ +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-w64-windows-gnu" + +define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 { +; CHECK-LABEL: @cff_index_load_offsets( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[X:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa !1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i8> [[TMP9]], i8 [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> [[TMP10]], i8 [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[TMP11]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw <4 x i32> [[TMP12]], +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP13]], [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* undef, align 1, !tbaa !1 +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* undef, align 1, !tbaa !1 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* undef, align 1, !tbaa !1 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* undef, align 1, !tbaa !1 +; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i32 0 +; CHECK-NEXT: store i32 [[TMP21]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP20]], i32 1 +; CHECK-NEXT: store i32 [[TMP22]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP20]], i32 2 +; CHECK-NEXT: store i32 [[TMP23]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP20]], i32 3 +; CHECK-NEXT: store i32 [[TMP24]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1, 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ null, [[MIDDLE_BLOCK]] ], [ null, [[IF_THEN]] ] +; CHECK-NEXT: br label [[FOR_BODY68:%.*]] +; CHECK: for.body68: +; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], 
[[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32 +; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24 +; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[P]], align 1, !tbaa !1 +; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP26]] to i32 +; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 +; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] +; CHECK-NEXT: [[TMP27:%.*]] = load i8, i8* undef, align 1, !tbaa !1 +; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] +; CHECK-NEXT: [[CONV81:%.*]] = zext i8 undef to i32 +; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]] +; CHECK-NEXT: store i32 [[OR83]], i32* undef, align 4, !tbaa !4 +; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, i8* [[P_359]], i64 4 +; CHECK-NEXT: [[CMP66:%.*]] = icmp ult i8* [[ADD_PTR86]], undef +; CHECK-NEXT: br i1 [[CMP66]], label [[FOR_BODY68]], label [[SW_EPILOG]], !llvm.loop !8 +; CHECK: sw.epilog: +; CHECK-NEXT: unreachable +; CHECK: Exit: +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.then, label %Exit + +if.then: ; preds = %entry + br label %for.body68 + +for.body68: ; preds = %for.body68, %if.then + %p.359 = phi i8* [ %add.ptr86, %for.body68 ], [ null, %if.then ] + %conv70 = zext i8 %x to i32 + %shl71 = shl nuw i32 %conv70, 24 + %0 = load i8, i8* %p, align 1, !tbaa !1 + %conv73 = zext i8 %0 to i32 + %shl74 = shl nuw nsw i32 %conv73, 16 + %or75 = or i32 %shl74, %shl71 + %1 = load i8, i8* undef, align 1, !tbaa !1 + %shl78 = shl nuw nsw i32 undef, 8 + %or79 = or i32 %or75, %shl78 + %conv81 = zext i8 undef to i32 + %or83 = or i32 %or79, %conv81 + store i32 %or83, i32* undef, align 4, !tbaa !4 + %add.ptr86 = getelementptr inbounds i8, i8* %p.359, i64 4 + %cmp66 = icmp ult i8* %add.ptr86, undef + br i1 %cmp66, label %for.body68, label %sw.epilog + +sw.epilog: ; preds = %for.body68 + unreachable + +Exit: ; preds = %entry + ret void +} + +attributes #0 = { "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 0fedc26a0dc0066f3968b9fea6a4e1f746c8d5a4)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{!5, !5, i64 0} +!5 = !{!"long", !2, i64 0} diff --git a/llvm/test/Transforms/LowerTypeTests/export-rename-local.ll b/llvm/test/Transforms/LowerTypeTests/export-rename-local.ll new file mode 100644 index 00000000000000..9e7c54ee94a396 --- /dev/null +++ b/llvm/test/Transforms/LowerTypeTests/export-rename-local.ll @@ -0,0 +1,15 @@ +; RUN: opt -S %s -lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/exported-funcs.yaml | FileCheck %s + +; CHECK: define internal void @external_addrtaken.1() +; CHECK: declare {{.*}} void @external_addrtaken.cfi() + +target triple = "x86_64-unknown-linux" + +define internal void @external_addrtaken() !type !1 { + ret void +} + +!cfi.functions = !{!0} + +!0 = !{!"external_addrtaken", i8 0, !1} +!1 = !{i64 0, !"typeid1"} diff --git a/llvm/test/Transforms/PGOProfile/chr.ll b/llvm/test/Transforms/PGOProfile/chr.ll index 0221a4fd000c4e..5b723df46c6428 100644 --- a/llvm/test/Transforms/PGOProfile/chr.ll +++ b/llvm/test/Transforms/PGOProfile/chr.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -chr -instcombine -simplifycfg -S | FileCheck %s ; RUN: opt < %s -passes='require,function(chr,instcombine,simplify-cfg)' -S | FileCheck %s @@ -468,14 +470,15 @@ define i32 @test_chr_5(i32* %i, i32 %sum0) !prof !14 { ; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP11]], i32 [[SUM1_NONCHR]], i32 [[TMP12]], !prof !16 ; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP0]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[BB3]], label [[BB1_NONCHR:%.*]], !prof !16 +; CHECK: bb1.nonchr: ; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP0]], 8 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0 -; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP16]], i32 44, i32 88 +; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP16]], i32 44, i32 88, !prof !16 ; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]] -; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP14]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ] +; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM2_NONCHR]], [[BB0_NONCHR]] ], [ [[SUM4_NONCHR]], [[BB1_NONCHR]] ] ; CHECK-NEXT: ret i32 [[SUM6]] ; entry: @@ -568,14 +571,15 @@ define i32 @test_chr_5_1(i32* %i, i32 %sum0) !prof !14 { ; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP14]], i32 [[SUM1_NONCHR]], i32 [[TMP15]], !prof !16 ; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[SUM0]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 0 +; CHECK-NEXT: br i1 [[TMP17]], label [[BB3]], label [[BB1_NONCHR:%.*]], !prof !16 +; CHECK: bb1.nonchr: ; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP0]], 8 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], 0 -; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP19]], i32 44, i32 88 +; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP19]], i32 44, i32 88, !prof !16 ; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]] -; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP17]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP7]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ] +; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP7]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM2_NONCHR]], [[BB0_NONCHR]] ], [ [[SUM4_NONCHR]], [[BB1_NONCHR]] ] ; CHECK-NEXT: ret i32 [[SUM6]] ; entry: @@ -665,14 +669,15 @@ define i32 @test_chr_6(i32* %i, i32* %j, i32 %sum0) !prof !14 { ; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[V4_NONCHR]], i32 [[SUM0]], i32 [[V8_NONCHR]], !prof !16 ; CHECK-NEXT: [[V9_NONCHR:%.*]] = and i32 [[J0]], 4 ; CHECK-NEXT: [[V10_NONCHR:%.*]] = icmp eq i32 [[V9_NONCHR]], 0 +; CHECK-NEXT: br i1 [[V10_NONCHR]], label [[BB3]], label [[BB1_NONCHR:%.*]], !prof !16 +; CHECK: bb1.nonchr: ; CHECK-NEXT: [[V11_NONCHR:%.*]] = and i32 [[I0]], 8 ; CHECK-NEXT: [[V12_NONCHR:%.*]] = icmp eq i32 [[V11_NONCHR]], 0 -; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[V12_NONCHR]], i32 44, i32 88 +; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[V12_NONCHR]], i32 44, i32 88, !prof !16 ; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]] -; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[V10_NONCHR]], 
i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[V13]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ] +; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[V13]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM2_NONCHR]], [[BB0_NONCHR]] ], [ [[SUM4_NONCHR]], [[BB1_NONCHR]] ] ; CHECK-NEXT: ret i32 [[SUM6]] ; entry: @@ -1751,14 +1756,15 @@ define i32 @test_chr_19(i32* %i, i32 %sum0) !prof !14 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[SUM0]], 85 ; CHECK-NEXT: [[SUM2_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM0]], i32 [[TMP9]], !prof !16 +; CHECK-NEXT: br i1 [[TMP8]], label [[BB3]], label [[BB1_NONCHR:%.*]], !prof !16 +; CHECK: bb1.nonchr: ; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP0]], 8 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 -; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP11]], i32 44, i32 88 +; CHECK-NEXT: [[SUM4_NONCHR_V:%.*]] = select i1 [[TMP11]], i32 44, i32 88, !prof !16 ; CHECK-NEXT: [[SUM4_NONCHR:%.*]] = add i32 [[SUM2_NONCHR]], [[SUM4_NONCHR_V]] -; CHECK-NEXT: [[SUM5_NONCHR:%.*]] = select i1 [[TMP8]], i32 [[SUM2_NONCHR]], i32 [[SUM4_NONCHR]], !prof !16 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM5_NONCHR]], [[BB0_NONCHR]] ] +; CHECK-NEXT: [[SUM6:%.*]] = phi i32 [ [[TMP4]], [[BB0]] ], [ [[SUM0]], [[ENTRY_SPLIT_NONCHR]] ], [ [[SUM2_NONCHR]], [[BB0_NONCHR]] ], [ [[SUM4_NONCHR]], [[BB1_NONCHR]] ] ; CHECK-NEXT: ret i32 [[SUM6]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll b/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll new file mode 100644 index 00000000000000..87f82dfe4e78f0 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mattr=sse2 -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE +; RUN: opt < %s -mattr=avx2 -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX + +; TODO: +; With AVX, we are able to vectorize the 1st 4 elements as 256-bit vector ops, +; but the final 2 elements remain scalar. They should get vectorized using +; 128-bit ops identically to what happens with SSE. 
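As a sketch for orientation, the IR below is the straight-line form of adding 1.0 to six consecutive doubles; the C rendering here uses hypothetical names and is not part of the test itself.

  /* Sketch of the PR28457 shape: six consecutive element-wise adds. Under SSE
     this splits into three <2 x double> operations; under AVX the first four
     lanes use a <4 x double> operation, and ideally the remaining two should
     still be done as a <2 x double> operation rather than staying scalar. */
  void pr28457_sketch(double *restrict q, const double *restrict p) {
    for (int i = 0; i < 6; ++i)
      q[i] = p[i] + 1.0;
  }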
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @PR28457(double* noalias nocapture align 32 %q, double* noalias nocapture readonly align 32 %p) { +; SSE-LABEL: @PR28457( +; SSE-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 0 +; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds double, double* [[P]], i64 1 +; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2 +; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3 +; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds double, double* [[P]], i64 4 +; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds double, double* [[P]], i64 5 +; SSE-NEXT: [[Q0:%.*]] = getelementptr inbounds double, double* [[Q:%.*]], i64 0 +; SSE-NEXT: [[Q1:%.*]] = getelementptr inbounds double, double* [[Q]], i64 1 +; SSE-NEXT: [[Q2:%.*]] = getelementptr inbounds double, double* [[Q]], i64 2 +; SSE-NEXT: [[Q3:%.*]] = getelementptr inbounds double, double* [[Q]], i64 3 +; SSE-NEXT: [[Q4:%.*]] = getelementptr inbounds double, double* [[Q]], i64 4 +; SSE-NEXT: [[Q5:%.*]] = getelementptr inbounds double, double* [[Q]], i64 5 +; SSE-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>* +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; SSE-NEXT: [[TMP3:%.*]] = bitcast double* [[P2]] to <2 x double>* +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 +; SSE-NEXT: [[TMP5:%.*]] = bitcast double* [[P4]] to <2 x double>* +; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP2]], +; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], +; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP6]], +; SSE-NEXT: [[TMP10:%.*]] = bitcast double* [[Q0]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP10]], align 8 +; SSE-NEXT: [[TMP11:%.*]] = bitcast double* [[Q2]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP11]], align 8 +; SSE-NEXT: [[TMP12:%.*]] = bitcast double* [[Q4]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP12]], align 8 +; SSE-NEXT: ret void +; +; AVX-LABEL: @PR28457( +; AVX-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 0 +; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds double, double* [[P]], i64 1 +; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2 +; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3 +; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds double, double* [[P]], i64 4 +; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds double, double* [[P]], i64 5 +; AVX-NEXT: [[Q0:%.*]] = getelementptr inbounds double, double* [[Q:%.*]], i64 0 +; AVX-NEXT: [[Q1:%.*]] = getelementptr inbounds double, double* [[Q]], i64 1 +; AVX-NEXT: [[Q2:%.*]] = getelementptr inbounds double, double* [[Q]], i64 2 +; AVX-NEXT: [[Q3:%.*]] = getelementptr inbounds double, double* [[Q]], i64 3 +; AVX-NEXT: [[Q4:%.*]] = getelementptr inbounds double, double* [[Q]], i64 4 +; AVX-NEXT: [[Q5:%.*]] = getelementptr inbounds double, double* [[Q]], i64 5 +; AVX-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <4 x double>* +; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 8 +; AVX-NEXT: [[D4:%.*]] = load double, double* [[P4]] +; AVX-NEXT: [[D5:%.*]] = load double, double* [[P5]] +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x 
double> [[TMP2]], +; AVX-NEXT: [[A4:%.*]] = fadd double [[D4]], 1.000000e+00 +; AVX-NEXT: [[A5:%.*]] = fadd double [[D5]], 1.000000e+00 +; AVX-NEXT: [[TMP4:%.*]] = bitcast double* [[Q0]] to <4 x double>* +; AVX-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 +; AVX-NEXT: store double [[A4]], double* [[Q4]] +; AVX-NEXT: store double [[A5]], double* [[Q5]] +; AVX-NEXT: ret void +; + %p0 = getelementptr inbounds double, double* %p, i64 0 + %p1 = getelementptr inbounds double, double* %p, i64 1 + %p2 = getelementptr inbounds double, double* %p, i64 2 + %p3 = getelementptr inbounds double, double* %p, i64 3 + %p4 = getelementptr inbounds double, double* %p, i64 4 + %p5 = getelementptr inbounds double, double* %p, i64 5 + + %q0 = getelementptr inbounds double, double* %q, i64 0 + %q1 = getelementptr inbounds double, double* %q, i64 1 + %q2 = getelementptr inbounds double, double* %q, i64 2 + %q3 = getelementptr inbounds double, double* %q, i64 3 + %q4 = getelementptr inbounds double, double* %q, i64 4 + %q5 = getelementptr inbounds double, double* %q, i64 5 + + %d0 = load double, double* %p0 + %d1 = load double, double* %p1 + %d2 = load double, double* %p2 + %d3 = load double, double* %p3 + %d4 = load double, double* %p4 + %d5 = load double, double* %p5 + + %a0 = fadd double %d0, 1.0 + %a1 = fadd double %d1, 1.0 + %a2 = fadd double %d2, 1.0 + %a3 = fadd double %d3, 1.0 + %a4 = fadd double %d4, 1.0 + %a5 = fadd double %d5, 1.0 + + store double %a0, double* %q0 + store double %a1, double* %q1 + store double %a2, double* %q2 + store double %a3, double* %q3 + store double %a4, double* %q4 + store double %a5, double* %q5 + ret void +} diff --git a/llvm/test/Transforms/SampleProfile/compressed-profile-symbol-list.ll b/llvm/test/Transforms/SampleProfile/compressed-profile-symbol-list.ll index 66265677229938..6ac62c1701b87c 100644 --- a/llvm/test/Transforms/SampleProfile/compressed-profile-symbol-list.ll +++ b/llvm/test/Transforms/SampleProfile/compressed-profile-symbol-list.ll @@ -1,5 +1,5 @@ ; REQUIRES: zlib ; Append inline.prof with profile symbol list and save it after compression. 
-; RUN: llvm-profdata merge --sample --prof-sym-list=%S/Inputs/profile-symbol-list.text --compress-prof-sym-list=true --extbinary %S/Inputs/inline.prof --output=%t.profdata +; RUN: llvm-profdata merge --sample --prof-sym-list=%S/Inputs/profile-symbol-list.text --compress-all-sections=true --extbinary %S/Inputs/inline.prof --output=%t.profdata ; RUN: opt < %S/Inputs/profile-symbol-list.ll -sample-profile -profile-accurate-for-symsinlist -sample-profile-file=%t.profdata -S | FileCheck %S/Inputs/profile-symbol-list.ll ; RUN: opt < %S/Inputs/profile-symbol-list.ll -passes=sample-profile -profile-accurate-for-symsinlist -sample-profile-file=%t.profdata -S | FileCheck %S/Inputs/profile-symbol-list.ll diff --git a/llvm/test/Transforms/SampleProfile/profile-format-compress.ll b/llvm/test/Transforms/SampleProfile/profile-format-compress.ll new file mode 100644 index 00000000000000..be4eae3cddcb0f --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/profile-format-compress.ll @@ -0,0 +1,123 @@ +; REQUIRES: zlib +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline.prof -S | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof -S | FileCheck %s +; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections %S/Inputs/inline.prof -o %t.compress.extbinary.afdo +; RUN: opt < %s -sample-profile -sample-profile-file=%t.compress.extbinary.afdo -S | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t.compress.extbinary.afdo -S | FileCheck %s + +; Original C++ test case +; +; #include +; +; int sum(int x, int y) { +; return x + y; +; } +; +; int main() { +; int s, i = 0; +; while (i++ < 20000 * 20000) +; if (i != 100) s = sum(i, s); else s = 30; +; printf("sum is %d\n", s); +; return 0; +; } +; +@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1 + +; Check sample-profile phase using compressed extbinary format profile +; will annotate the IR with exactly the same result as using text format. +; CHECK: br i1 %cmp, label %while.body, label %while.end{{.*}} !prof ![[IDX1:[0-9]*]] +; CHECK: br i1 %cmp1, label %if.then, label %if.else{{.*}} !prof ![[IDX2:[0-9]*]] +; CHECK: call i32 (i8*, ...) 
@printf{{.*}} !prof ![[IDX3:[0-9]*]] +; CHECK: = !{!"TotalCount", i64 26781} +; CHECK: = !{!"MaxCount", i64 5553} +; CHECK: ![[IDX1]] = !{!"branch_weights", i32 5392, i32 163} +; CHECK: ![[IDX2]] = !{!"branch_weights", i32 5280, i32 113} +; CHECK: ![[IDX3]] = !{!"branch_weights", i32 1} + +; Function Attrs: nounwind uwtable +define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !4 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %0 = load i32, i32* %x.addr, align 4, !dbg !11 + %1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %0, %1, !dbg !11 + ret i32 %add, !dbg !11 +} + +; Function Attrs: uwtable +define i32 @main() !dbg !7 { +entry: + %retval = alloca i32, align 4 + %s = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval + store i32 0, i32* %i, align 4, !dbg !12 + br label %while.cond, !dbg !13 + +while.cond: ; preds = %if.end, %entry + %0 = load i32, i32* %i, align 4, !dbg !14 + %inc = add nsw i32 %0, 1, !dbg !14 + store i32 %inc, i32* %i, align 4, !dbg !14 + %cmp = icmp slt i32 %0, 400000000, !dbg !14 + br i1 %cmp, label %while.body, label %while.end, !dbg !14 + +while.body: ; preds = %while.cond + %1 = load i32, i32* %i, align 4, !dbg !16 + %cmp1 = icmp ne i32 %1, 100, !dbg !16 + br i1 %cmp1, label %if.then, label %if.else, !dbg !16 + + +if.then: ; preds = %while.body + %2 = load i32, i32* %i, align 4, !dbg !18 + %3 = load i32, i32* %s, align 4, !dbg !18 + %call = call i32 @_Z3sumii(i32 %2, i32 %3), !dbg !18 + store i32 %call, i32* %s, align 4, !dbg !18 + br label %if.end, !dbg !18 + +if.else: ; preds = %while.body + store i32 30, i32* %s, align 4, !dbg !20 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %while.cond, !dbg !22 + +while.end: ; preds = %while.cond + %4 = load i32, i32* %s, align 4, !dbg !24 + %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %4), !dbg !24 + ret i32 0, !dbg !25 +} + +declare i32 @printf(i8*, ...) 
#2 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "sum", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, retainedNodes: !2) +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = !{!"clang version 3.5 "} +!11 = !DILocation(line: 4, scope: !4) +!12 = !DILocation(line: 8, scope: !7) +!13 = !DILocation(line: 9, scope: !7) +!14 = !DILocation(line: 9, scope: !15) +!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7) +!16 = !DILocation(line: 10, scope: !17) +!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) +!18 = !DILocation(line: 10, scope: !19) +!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) +!20 = !DILocation(line: 10, scope: !21) +!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17) +!22 = !DILocation(line: 10, scope: !23) +!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17) +!24 = !DILocation(line: 11, scope: !7) +!25 = !DILocation(line: 12, scope: !7) diff --git a/llvm/test/Transforms/SampleProfile/uncompressed-profile-symbol-list.ll b/llvm/test/Transforms/SampleProfile/uncompressed-profile-symbol-list.ll index abe562d7ebbe31..5eaf4b279d73e4 100644 --- a/llvm/test/Transforms/SampleProfile/uncompressed-profile-symbol-list.ll +++ b/llvm/test/Transforms/SampleProfile/uncompressed-profile-symbol-list.ll @@ -1,4 +1,4 @@ ; Append inline.prof with profile symbol list and save it without compression. -; RUN: llvm-profdata merge --sample --prof-sym-list=%S/Inputs/profile-symbol-list.text --compress-prof-sym-list=false --extbinary %S/Inputs/inline.prof --output=%t.profdata +; RUN: llvm-profdata merge --sample --prof-sym-list=%S/Inputs/profile-symbol-list.text --compress-all-sections=false --extbinary %S/Inputs/inline.prof --output=%t.profdata ; RUN: opt < %S/Inputs/profile-symbol-list.ll -sample-profile -profile-accurate-for-symsinlist -sample-profile-file=%t.profdata -S | FileCheck %S/Inputs/profile-symbol-list.ll ; RUN: opt < %S/Inputs/profile-symbol-list.ll -passes=sample-profile -profile-accurate-for-symsinlist -sample-profile-file=%t.profdata -S | FileCheck %S/Inputs/profile-symbol-list.ll diff --git a/llvm/test/Transforms/SimplifyCFG/PhiEliminate3.ll b/llvm/test/Transforms/SimplifyCFG/PhiEliminate3.ll index 4d7435ce3c2540..ee876c6e223a67 100644 --- a/llvm/test/Transforms/SimplifyCFG/PhiEliminate3.ll +++ b/llvm/test/Transforms/SimplifyCFG/PhiEliminate3.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -simplifycfg -S -phi-node-folding-threshold=1 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK-ONE ; RUN: opt < %s -simplifycfg -S -phi-node-folding-threshold=2 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK-TWO @@ -7,21 +9,55 @@ ; with various folding thresholds define i32 @test(i1 %a, i1 %b, i32 %i, i32 %j, i32 %k) { -; ALL-LABEL: @test( -; ALL-NEXT: entry: -; ALL-NEXT: br i1 [[A:%.*]], label [[M:%.*]], label [[O:%.*]] -; ALL: O: -; ALL-NEXT: [[IAJ:%.*]] = add i32 [[I:%.*]], [[J:%.*]] -; ALL-NEXT: [[IAJAK:%.*]] = add i32 [[IAJ]], [[K:%.*]] -; ALL-NEXT: [[IXJ:%.*]] = xor i32 [[I]], [[J]] -; ALL-NEXT: [[IXJXK:%.*]] = xor i32 [[IXJ]], [[K]] -; ALL-NEXT: [[WP:%.*]] = select i1 [[B:%.*]], i32 [[IAJAK]], i32 [[IXJXK]] -; ALL-NEXT: [[WP2:%.*]] = add i32 [[WP]], [[WP]] -; ALL-NEXT: br label [[M]] -; ALL: M: -; ALL-NEXT: [[W:%.*]] = phi i32 [ [[WP2]], [[O]] ], [ 2, [[ENTRY:%.*]] ] -; ALL-NEXT: [[R:%.*]] = add i32 [[W]], 1 -; ALL-NEXT: ret i32 [[R]] +; CHECK-ONE-LABEL: @test( +; CHECK-ONE-NEXT: entry: +; CHECK-ONE-NEXT: br i1 [[A:%.*]], label [[M:%.*]], label [[O:%.*]] +; CHECK-ONE: O: +; CHECK-ONE-NEXT: br i1 [[B:%.*]], label [[P:%.*]], label [[Q:%.*]] +; CHECK-ONE: P: +; CHECK-ONE-NEXT: [[IAJ:%.*]] = add i32 [[I:%.*]], [[J:%.*]] +; CHECK-ONE-NEXT: [[IAJAK:%.*]] = add i32 [[IAJ]], [[K:%.*]] +; CHECK-ONE-NEXT: br label [[N:%.*]] +; CHECK-ONE: Q: +; CHECK-ONE-NEXT: [[IXJ:%.*]] = xor i32 [[I]], [[J]] +; CHECK-ONE-NEXT: [[IXJXK:%.*]] = xor i32 [[IXJ]], [[K]] +; CHECK-ONE-NEXT: br label [[N]] +; CHECK-ONE: N: +; CHECK-ONE-NEXT: [[WP:%.*]] = phi i32 [ [[IAJAK]], [[P]] ], [ [[IXJXK]], [[Q]] ] +; CHECK-ONE-NEXT: [[WP2:%.*]] = add i32 [[WP]], [[WP]] +; CHECK-ONE-NEXT: br label [[M]] +; CHECK-ONE: M: +; CHECK-ONE-NEXT: [[W:%.*]] = phi i32 [ [[WP2]], [[N]] ], [ 2, [[ENTRY:%.*]] ] +; CHECK-ONE-NEXT: [[R:%.*]] = add i32 [[W]], 1 +; CHECK-ONE-NEXT: ret i32 [[R]] +; +; CHECK-TWO-LABEL: @test( +; CHECK-TWO-NEXT: entry: +; CHECK-TWO-NEXT: br i1 [[A:%.*]], label [[M:%.*]], label [[O:%.*]] +; CHECK-TWO: O: +; CHECK-TWO-NEXT: [[IAJ:%.*]] = add i32 [[I:%.*]], [[J:%.*]] +; CHECK-TWO-NEXT: [[IAJAK:%.*]] = add i32 [[IAJ]], [[K:%.*]] +; CHECK-TWO-NEXT: [[IXJ:%.*]] = xor i32 [[I]], [[J]] +; CHECK-TWO-NEXT: [[IXJXK:%.*]] = xor i32 [[IXJ]], [[K]] +; CHECK-TWO-NEXT: [[WP:%.*]] = select i1 [[B:%.*]], i32 [[IAJAK]], i32 [[IXJXK]] +; CHECK-TWO-NEXT: [[WP2:%.*]] = add i32 [[WP]], [[WP]] +; CHECK-TWO-NEXT: br label [[M]] +; CHECK-TWO: M: +; CHECK-TWO-NEXT: [[W:%.*]] = phi i32 [ [[WP2]], [[O]] ], [ 2, [[ENTRY:%.*]] ] +; CHECK-TWO-NEXT: [[R:%.*]] = add i32 [[W]], 1 +; CHECK-TWO-NEXT: ret i32 [[R]] +; +; CHECK-SEVEN-LABEL: @test( +; CHECK-SEVEN-NEXT: entry: +; CHECK-SEVEN-NEXT: [[IAJ:%.*]] = add i32 [[I:%.*]], [[J:%.*]] +; CHECK-SEVEN-NEXT: [[IAJAK:%.*]] = add i32 [[IAJ]], [[K:%.*]] +; CHECK-SEVEN-NEXT: [[IXJ:%.*]] = xor i32 [[I]], [[J]] +; CHECK-SEVEN-NEXT: [[IXJXK:%.*]] = xor i32 [[IXJ]], [[K]] +; CHECK-SEVEN-NEXT: [[WP:%.*]] = select i1 [[B:%.*]], i32 [[IAJAK]], i32 [[IXJXK]] +; CHECK-SEVEN-NEXT: [[WP2:%.*]] = add i32 [[WP]], [[WP]] +; CHECK-SEVEN-NEXT: [[W:%.*]] = select i1 [[A:%.*]], i32 2, i32 [[WP2]] +; CHECK-SEVEN-NEXT: [[R:%.*]] = add i32 [[W]], 1 +; CHECK-SEVEN-NEXT: ret i32 [[R]] ; entry: br i1 %a, label %M, label %O diff --git a/llvm/test/Transforms/SimplifyCFG/SpeculativeExec.ll b/llvm/test/Transforms/SimplifyCFG/SpeculativeExec.ll index c21edd0c2ad8fa..0e833c8d8e6a5e 100644 --- 
a/llvm/test/Transforms/SimplifyCFG/SpeculativeExec.ll +++ b/llvm/test/Transforms/SimplifyCFG/SpeculativeExec.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -simplifycfg -phi-node-folding-threshold=2 -S | FileCheck %s @@ -8,10 +10,14 @@ define i32 @test1(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[B:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[BB1:%.*]], label [[BB3:%.*]] +; CHECK: bb1: ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[C:%.*]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[A:%.*]], 1 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 [[A]] -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], i32 [[SPEC_SELECT]], i32 [[B]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[BB1]] ] ; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP4]], 1 ; CHECK-NEXT: ret i32 [[TMP5]] ; diff --git a/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll b/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll index 3a2f067a2ee32a..9be1b16357a1df 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/speculate-cttz-ctlz.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -simplifycfg -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s --check-prefix=ALL --check-prefix=BMI ; RUN: opt -S -simplifycfg -mtriple=x86_64-unknown-unknown -mattr=+lzcnt < %s | FileCheck %s --check-prefix=ALL --check-prefix=LZCNT @@ -223,13 +225,37 @@ cond.end: ; preds = %entry, %cond.true ; for the target. 
define i64 @test1e(i32 %x) { -; ALL-LABEL: @test1e( -; ALL-NEXT: entry: -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 -; ALL-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true) -; ALL-NEXT: [[PHITMP2:%.*]] = zext i32 [[TMP0]] to i64 -; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i64 32, i64 [[PHITMP2]] -; ALL-NEXT: ret i64 [[COND]] +; BMI-LABEL: @test1e( +; BMI-NEXT: entry: +; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; BMI-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true) +; BMI-NEXT: [[PHITMP2:%.*]] = zext i32 [[TMP0]] to i64 +; BMI-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i64 32, i64 [[PHITMP2]] +; BMI-NEXT: ret i64 [[COND]] +; +; LZCNT-LABEL: @test1e( +; LZCNT-NEXT: entry: +; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; LZCNT-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; LZCNT: cond.true: +; LZCNT-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true) +; LZCNT-NEXT: [[PHITMP2:%.*]] = zext i32 [[TMP0]] to i64 +; LZCNT-NEXT: br label [[COND_END]] +; LZCNT: cond.end: +; LZCNT-NEXT: [[COND:%.*]] = phi i64 [ [[PHITMP2]], [[COND_TRUE]] ], [ 32, [[ENTRY:%.*]] ] +; LZCNT-NEXT: ret i64 [[COND]] +; +; GENERIC-LABEL: @test1e( +; GENERIC-NEXT: entry: +; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; GENERIC: cond.true: +; GENERIC-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true) +; GENERIC-NEXT: [[PHITMP2:%.*]] = zext i32 [[TMP0]] to i64 +; GENERIC-NEXT: br label [[COND_END]] +; GENERIC: cond.end: +; GENERIC-NEXT: [[COND:%.*]] = phi i64 [ [[PHITMP2]], [[COND_TRUE]] ], [ 32, [[ENTRY:%.*]] ] +; GENERIC-NEXT: ret i64 [[COND]] ; entry: %tobool = icmp eq i32 %x, 0 @@ -246,13 +272,37 @@ cond.end: ; preds = %entry, %cond.true } define i32 @test2e(i64 %x) { -; ALL-LABEL: @test2e( -; ALL-NEXT: entry: -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 -; ALL-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X]], i1 true) -; ALL-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 -; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i32 64, i32 [[CAST]] -; ALL-NEXT: ret i32 [[COND]] +; BMI-LABEL: @test2e( +; BMI-NEXT: entry: +; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; BMI-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X]], i1 true) +; BMI-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 +; BMI-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i32 64, i32 [[CAST]] +; BMI-NEXT: ret i32 [[COND]] +; +; LZCNT-LABEL: @test2e( +; LZCNT-NEXT: entry: +; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; LZCNT-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; LZCNT: cond.true: +; LZCNT-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X]], i1 true) +; LZCNT-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 +; LZCNT-NEXT: br label [[COND_END]] +; LZCNT: cond.end: +; LZCNT-NEXT: [[COND:%.*]] = phi i32 [ [[CAST]], [[COND_TRUE]] ], [ 64, [[ENTRY:%.*]] ] +; LZCNT-NEXT: ret i32 [[COND]] +; +; GENERIC-LABEL: @test2e( +; GENERIC-NEXT: entry: +; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; GENERIC: cond.true: +; GENERIC-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X]], i1 true) +; GENERIC-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 +; GENERIC-NEXT: br label [[COND_END]] +; GENERIC: cond.end: +; GENERIC-NEXT: [[COND:%.*]] = phi i32 [ 
[[CAST]], [[COND_TRUE]] ], [ 64, [[ENTRY:%.*]] ] +; GENERIC-NEXT: ret i32 [[COND]] ; entry: %tobool = icmp eq i64 %x, 0 @@ -269,13 +319,37 @@ cond.end: ; preds = %entry, %cond.true } define i64 @test3e(i32 %x) { -; ALL-LABEL: @test3e( -; ALL-NEXT: entry: -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 -; ALL-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) -; ALL-NEXT: [[PHITMP2:%.*]] = zext i32 [[TMP0]] to i64 -; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i64 32, i64 [[PHITMP2]] -; ALL-NEXT: ret i64 [[COND]] +; BMI-LABEL: @test3e( +; BMI-NEXT: entry: +; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; BMI-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; BMI: cond.true: +; BMI-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) +; BMI-NEXT: [[PHITMP2:%.*]] = zext i32 [[TMP0]] to i64 +; BMI-NEXT: br label [[COND_END]] +; BMI: cond.end: +; BMI-NEXT: [[COND:%.*]] = phi i64 [ [[PHITMP2]], [[COND_TRUE]] ], [ 32, [[ENTRY:%.*]] ] +; BMI-NEXT: ret i64 [[COND]] +; +; LZCNT-LABEL: @test3e( +; LZCNT-NEXT: entry: +; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; LZCNT-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) +; LZCNT-NEXT: [[PHITMP2:%.*]] = zext i32 [[TMP0]] to i64 +; LZCNT-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i64 32, i64 [[PHITMP2]] +; LZCNT-NEXT: ret i64 [[COND]] +; +; GENERIC-LABEL: @test3e( +; GENERIC-NEXT: entry: +; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; GENERIC: cond.true: +; GENERIC-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) +; GENERIC-NEXT: [[PHITMP2:%.*]] = zext i32 [[TMP0]] to i64 +; GENERIC-NEXT: br label [[COND_END]] +; GENERIC: cond.end: +; GENERIC-NEXT: [[COND:%.*]] = phi i64 [ [[PHITMP2]], [[COND_TRUE]] ], [ 32, [[ENTRY:%.*]] ] +; GENERIC-NEXT: ret i64 [[COND]] ; entry: %tobool = icmp eq i32 %x, 0 @@ -292,13 +366,37 @@ cond.end: ; preds = %entry, %cond.true } define i32 @test4e(i64 %x) { -; ALL-LABEL: @test4e( -; ALL-NEXT: entry: -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 -; ALL-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X]], i1 true) -; ALL-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 -; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i32 64, i32 [[CAST]] -; ALL-NEXT: ret i32 [[COND]] +; BMI-LABEL: @test4e( +; BMI-NEXT: entry: +; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; BMI-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; BMI: cond.true: +; BMI-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X]], i1 true) +; BMI-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 +; BMI-NEXT: br label [[COND_END]] +; BMI: cond.end: +; BMI-NEXT: [[COND:%.*]] = phi i32 [ [[CAST]], [[COND_TRUE]] ], [ 64, [[ENTRY:%.*]] ] +; BMI-NEXT: ret i32 [[COND]] +; +; LZCNT-LABEL: @test4e( +; LZCNT-NEXT: entry: +; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; LZCNT-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X]], i1 true) +; LZCNT-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 +; LZCNT-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i32 64, i32 [[CAST]] +; LZCNT-NEXT: ret i32 [[COND]] +; +; GENERIC-LABEL: @test4e( +; GENERIC-NEXT: entry: +; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; GENERIC: cond.true: +; GENERIC-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X]], i1 true) 
+; GENERIC-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 +; GENERIC-NEXT: br label [[COND_END]] +; GENERIC: cond.end: +; GENERIC-NEXT: [[COND:%.*]] = phi i32 [ [[CAST]], [[COND_TRUE]] ], [ 64, [[ENTRY:%.*]] ] +; GENERIC-NEXT: ret i32 [[COND]] ; entry: %tobool = icmp eq i64 %x, 0 @@ -315,13 +413,37 @@ cond.end: ; preds = %entry, %cond.true } define i16 @test5e(i64 %x) { -; ALL-LABEL: @test5e( -; ALL-NEXT: entry: -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 -; ALL-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X]], i1 true) -; ALL-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i16 -; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 64, i16 [[CAST]] -; ALL-NEXT: ret i16 [[COND]] +; BMI-LABEL: @test5e( +; BMI-NEXT: entry: +; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; BMI-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; BMI: cond.true: +; BMI-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X]], i1 true) +; BMI-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i16 +; BMI-NEXT: br label [[COND_END]] +; BMI: cond.end: +; BMI-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 64, [[ENTRY:%.*]] ] +; BMI-NEXT: ret i16 [[COND]] +; +; LZCNT-LABEL: @test5e( +; LZCNT-NEXT: entry: +; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; LZCNT-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X]], i1 true) +; LZCNT-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i16 +; LZCNT-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 64, i16 [[CAST]] +; LZCNT-NEXT: ret i16 [[COND]] +; +; GENERIC-LABEL: @test5e( +; GENERIC-NEXT: entry: +; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; GENERIC: cond.true: +; GENERIC-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[X]], i1 true) +; GENERIC-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i16 +; GENERIC-NEXT: br label [[COND_END]] +; GENERIC: cond.end: +; GENERIC-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 64, [[ENTRY:%.*]] ] +; GENERIC-NEXT: ret i16 [[COND]] ; entry: %tobool = icmp eq i64 %x, 0 @@ -338,13 +460,37 @@ cond.end: ; preds = %entry, %cond.true } define i16 @test6e(i32 %x) { -; ALL-LABEL: @test6e( -; ALL-NEXT: entry: -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 -; ALL-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) -; ALL-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16 -; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 32, i16 [[CAST]] -; ALL-NEXT: ret i16 [[COND]] +; BMI-LABEL: @test6e( +; BMI-NEXT: entry: +; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; BMI-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; BMI: cond.true: +; BMI-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) +; BMI-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16 +; BMI-NEXT: br label [[COND_END]] +; BMI: cond.end: +; BMI-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 32, [[ENTRY:%.*]] ] +; BMI-NEXT: ret i16 [[COND]] +; +; LZCNT-LABEL: @test6e( +; LZCNT-NEXT: entry: +; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; LZCNT-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) +; LZCNT-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16 +; LZCNT-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 32, i16 [[CAST]] +; LZCNT-NEXT: ret i16 [[COND]] +; +; GENERIC-LABEL: @test6e( +; GENERIC-NEXT: entry: +; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; GENERIC-NEXT: br i1 [[TOBOOL]], label 
[[COND_END:%.*]], label [[COND_TRUE:%.*]] +; GENERIC: cond.true: +; GENERIC-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[X]], i1 true) +; GENERIC-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16 +; GENERIC-NEXT: br label [[COND_END]] +; GENERIC: cond.end: +; GENERIC-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 32, [[ENTRY:%.*]] ] +; GENERIC-NEXT: ret i16 [[COND]] ; entry: %tobool = icmp eq i32 %x, 0 @@ -361,13 +507,37 @@ cond.end: ; preds = %entry, %cond.true } define i16 @test7e(i64 %x) { -; ALL-LABEL: @test7e( -; ALL-NEXT: entry: -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 -; ALL-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X]], i1 true) -; ALL-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i16 -; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 64, i16 [[CAST]] -; ALL-NEXT: ret i16 [[COND]] +; BMI-LABEL: @test7e( +; BMI-NEXT: entry: +; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; BMI-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X]], i1 true) +; BMI-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i16 +; BMI-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 64, i16 [[CAST]] +; BMI-NEXT: ret i16 [[COND]] +; +; LZCNT-LABEL: @test7e( +; LZCNT-NEXT: entry: +; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; LZCNT-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; LZCNT: cond.true: +; LZCNT-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X]], i1 true) +; LZCNT-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i16 +; LZCNT-NEXT: br label [[COND_END]] +; LZCNT: cond.end: +; LZCNT-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 64, [[ENTRY:%.*]] ] +; LZCNT-NEXT: ret i16 [[COND]] +; +; GENERIC-LABEL: @test7e( +; GENERIC-NEXT: entry: +; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[X:%.*]], 0 +; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; GENERIC: cond.true: +; GENERIC-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[X]], i1 true) +; GENERIC-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i16 +; GENERIC-NEXT: br label [[COND_END]] +; GENERIC: cond.end: +; GENERIC-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 64, [[ENTRY:%.*]] ] +; GENERIC-NEXT: ret i16 [[COND]] ; entry: %tobool = icmp eq i64 %x, 0 @@ -384,13 +554,37 @@ cond.end: ; preds = %entry, %cond.true } define i16 @test8e(i32 %x) { -; ALL-LABEL: @test8e( -; ALL-NEXT: entry: -; ALL-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 -; ALL-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true) -; ALL-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16 -; ALL-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 32, i16 [[CAST]] -; ALL-NEXT: ret i16 [[COND]] +; BMI-LABEL: @test8e( +; BMI-NEXT: entry: +; BMI-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; BMI-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true) +; BMI-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16 +; BMI-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i16 32, i16 [[CAST]] +; BMI-NEXT: ret i16 [[COND]] +; +; LZCNT-LABEL: @test8e( +; LZCNT-NEXT: entry: +; LZCNT-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; LZCNT-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; LZCNT: cond.true: +; LZCNT-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true) +; LZCNT-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16 +; LZCNT-NEXT: br label [[COND_END]] +; LZCNT: cond.end: +; LZCNT-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 32, [[ENTRY:%.*]] ] +; LZCNT-NEXT: ret i16 
[[COND]] +; +; GENERIC-LABEL: @test8e( +; GENERIC-NEXT: entry: +; GENERIC-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X:%.*]], 0 +; GENERIC-NEXT: br i1 [[TOBOOL]], label [[COND_END:%.*]], label [[COND_TRUE:%.*]] +; GENERIC: cond.true: +; GENERIC-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[X]], i1 true) +; GENERIC-NEXT: [[CAST:%.*]] = trunc i32 [[TMP0]] to i16 +; GENERIC-NEXT: br label [[COND_END]] +; GENERIC: cond.end: +; GENERIC-NEXT: [[COND:%.*]] = phi i16 [ [[CAST]], [[COND_TRUE]] ], [ 32, [[ENTRY:%.*]] ] +; GENERIC-NEXT: ret i16 [[COND]] ; entry: %tobool = icmp eq i32 %x, 0 diff --git a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll index 597b5b969a739c..b8e093246ce691 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -simplifycfg -switch-to-lookup=true -keep-loops=false -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s ; RUN: opt < %s -passes='simplify-cfg' -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s @@ -1437,10 +1439,14 @@ define i32 @no_reuse_cmp2(i32 %x, i32 %y) { ; CHECK-LABEL: @no_reuse_cmp2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[Y:%.*]], 0 +; CHECK-NEXT: br i1 [[EC]], label [[SWITCH_ENTRY:%.*]], label [[SW_EPILOG:%.*]] +; CHECK: switch.entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X:%.*]], 4 ; CHECK-NEXT: [[SWITCH_OFFSET:%.*]] = add i32 [[X]], 10 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP0]], i32 [[SWITCH_OFFSET]], i32 0 -; CHECK-NEXT: [[R_0:%.*]] = select i1 [[EC]], i32 [[SPEC_SELECT]], i32 100 +; CHECK-NEXT: br label [[SW_EPILOG]] +; CHECK: sw.epilog: +; CHECK-NEXT: [[R_0:%.*]] = phi i32 [ 100, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[SWITCH_ENTRY]] ] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[R_0]], 0 ; CHECK-NEXT: [[DOTR_0:%.*]] = select i1 [[CMP]], i32 100, i32 [[R_0]] ; CHECK-NEXT: ret i32 [[DOTR_0]] diff --git a/llvm/test/Transforms/SimplifyCFG/safe-abs.ll b/llvm/test/Transforms/SimplifyCFG/safe-abs.ll index 6d8028f8d9433a..ddc3f7d538aef1 100644 --- a/llvm/test/Transforms/SimplifyCFG/safe-abs.ll +++ b/llvm/test/Transforms/SimplifyCFG/safe-abs.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -simplifycfg -S | FileCheck %s @@ -8,10 +10,14 @@ define i32 @abs_with_clamp(i32 %arg) { ; CHECK-LABEL: @abs_with_clamp( ; CHECK-NEXT: begin: ; CHECK-NEXT: [[IS_POSITIVE:%.*]] = icmp sgt i32 [[ARG:%.*]], 0 +; CHECK-NEXT: br i1 [[IS_POSITIVE]], label [[END:%.*]], label [[NEGATIVE:%.*]] +; CHECK: negative: ; CHECK-NEXT: [[IS_INT_MIN:%.*]] = icmp eq i32 [[ARG]], -2147483648 ; CHECK-NEXT: [[NEGATED:%.*]] = sub nsw i32 0, [[ARG]] ; CHECK-NEXT: [[ABS:%.*]] = select i1 [[IS_INT_MIN]], i32 2147483647, i32 [[NEGATED]] -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[IS_POSITIVE]], i32 [[ARG]], i32 [[ABS]] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[ARG]], [[BEGIN:%.*]] ], [ [[ABS]], [[NEGATIVE]] ] ; CHECK-NEXT: ret i32 [[TMP6]] ; begin: diff --git a/llvm/test/Transforms/SimplifyCFG/safe-low-bit-extract.ll b/llvm/test/Transforms/SimplifyCFG/safe-low-bit-extract.ll index 19a6313aa1d0dd..4a8d35c90feca6 100644 --- a/llvm/test/Transforms/SimplifyCFG/safe-low-bit-extract.ll +++ b/llvm/test/Transforms/SimplifyCFG/safe-low-bit-extract.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -simplifycfg -S | FileCheck %s @@ -9,10 +11,14 @@ define i32 @extract_low_bits(i32 %input, i32 %nbits) { ; CHECK-LABEL: @extract_low_bits( ; CHECK-NEXT: begin: ; CHECK-NEXT: [[SHOULD_MASK:%.*]] = icmp ult i32 [[NBITS:%.*]], 32 +; CHECK-NEXT: br i1 [[SHOULD_MASK]], label [[PERFORM_MASKING:%.*]], label [[END:%.*]] +; CHECK: perform_masking: ; CHECK-NEXT: [[MASK_NOT:%.*]] = shl nsw i32 -1, [[NBITS]] ; CHECK-NEXT: [[MASK:%.*]] = xor i32 [[MASK_NOT]], -1 ; CHECK-NEXT: [[MASKED:%.*]] = and i32 [[MASK]], [[INPUT:%.*]] -; CHECK-NEXT: [[RES:%.*]] = select i1 [[SHOULD_MASK]], i32 [[MASKED]], i32 [[INPUT]] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[MASKED]], [[PERFORM_MASKING]] ], [ [[INPUT]], [[BEGIN:%.*]] ] ; CHECK-NEXT: ret i32 [[RES]] ; begin: diff --git a/llvm/test/Transforms/SimplifyCFG/signbit-like-value-extension.ll b/llvm/test/Transforms/SimplifyCFG/signbit-like-value-extension.ll index e955e0679dc123..55596941a1ef33 100644 --- a/llvm/test/Transforms/SimplifyCFG/signbit-like-value-extension.ll +++ b/llvm/test/Transforms/SimplifyCFG/signbit-like-value-extension.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. 
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -simplifycfg -S | FileCheck %s @@ -11,10 +13,14 @@ define i32 @extend_value(i32 %storage, i32 %nbits) { ; CHECK-NEXT: [[SKIPNBITS:%.*]] = sub i32 32, [[NBITS:%.*]] ; CHECK-NEXT: [[VALUE:%.*]] = lshr i32 [[STORAGE:%.*]], [[SKIPNBITS]] ; CHECK-NEXT: [[SHOULDEXTEND:%.*]] = icmp sgt i32 [[STORAGE]], -1 +; CHECK-NEXT: br i1 [[SHOULDEXTEND]], label [[EXTEND:%.*]], label [[END:%.*]] +; CHECK: extend: ; CHECK-NEXT: [[HIGHBITMASK:%.*]] = shl nsw i32 -1, [[NBITS]] ; CHECK-NEXT: [[HIGHBITMASKPLUSONE:%.*]] = add nsw i32 [[HIGHBITMASK]], 1 ; CHECK-NEXT: [[EXTENDED:%.*]] = add i32 [[HIGHBITMASKPLUSONE]], [[VALUE]] -; CHECK-NEXT: [[RES:%.*]] = select i1 [[SHOULDEXTEND]], i32 [[EXTENDED]], i32 [[VALUE]] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[EXTENDED]], [[EXTEND]] ], [ [[VALUE]], [[BB:%.*]] ] ; CHECK-NEXT: ret i32 [[RES]] ; bb: diff --git a/llvm/test/Transforms/SimplifyCFG/speculate-math.ll b/llvm/test/Transforms/SimplifyCFG/speculate-math.ll index e3fe5ed2fda786..09512a1aa2a877 100644 --- a/llvm/test/Transforms/SimplifyCFG/speculate-math.ll +++ b/llvm/test/Transforms/SimplifyCFG/speculate-math.ll @@ -1,3 +1,5 @@ +; Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +; Notified per clause 4(b) of the license. ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -simplifycfg < %s | FileCheck %s --check-prefix=ALL --check-prefix=EXPENSIVE ; RUN: opt -S -simplifycfg -speculate-one-expensive-inst=false < %s | FileCheck %s --check-prefix=ALL --check-prefix=CHEAP @@ -12,12 +14,23 @@ declare float @llvm.minimum.f32(float, float) nounwind readonly declare float @llvm.maximum.f32(float, float) nounwind readonly define double @fdiv_test(double %a, double %b) { -; ALL-LABEL: @fdiv_test( -; ALL-NEXT: entry: -; ALL-NEXT: [[CMP:%.*]] = fcmp ogt double [[A:%.*]], 0.000000e+00 -; ALL-NEXT: [[DIV:%.*]] = fdiv double [[B:%.*]], [[A]] -; ALL-NEXT: [[COND:%.*]] = select i1 [[CMP]], double [[DIV]], double 0.000000e+00 -; ALL-NEXT: ret double [[COND]] +; EXPENSIVE-LABEL: @fdiv_test( +; EXPENSIVE-NEXT: entry: +; EXPENSIVE-NEXT: [[CMP:%.*]] = fcmp ogt double [[A:%.*]], 0.000000e+00 +; EXPENSIVE-NEXT: [[DIV:%.*]] = fdiv double [[B:%.*]], [[A]] +; EXPENSIVE-NEXT: [[COND:%.*]] = select i1 [[CMP]], double [[DIV]], double 0.000000e+00 +; EXPENSIVE-NEXT: ret double [[COND]] +; +; CHEAP-LABEL: @fdiv_test( +; CHEAP-NEXT: entry: +; CHEAP-NEXT: [[CMP:%.*]] = fcmp ogt double [[A:%.*]], 0.000000e+00 +; CHEAP-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHEAP: cond.true: +; CHEAP-NEXT: [[DIV:%.*]] = fdiv double [[B:%.*]], [[A]] +; CHEAP-NEXT: br label [[COND_END]] +; CHEAP: cond.end: +; CHEAP-NEXT: [[COND:%.*]] = phi double [ [[DIV]], [[COND_TRUE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHEAP-NEXT: ret double [[COND]] ; entry: %cmp = fcmp ogt double %a, 0.0 diff --git a/llvm/test/Verifier/fp-intrinsics.ll b/llvm/test/Verifier/fp-intrinsics.ll index 36e5442bf3cb5e..12a3c9d6582da7 100644 --- a/llvm/test/Verifier/fp-intrinsics.ll +++ b/llvm/test/Verifier/fp-intrinsics.ll @@ -15,64 +15,66 @@ declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadat ; CHECK1: attributes #[[ATTR]] = { inaccessiblememonly nounwind willreturn } ; Note: FP exceptions aren't usually caught through normal unwind mechanisms, ; but we may want to revisit this for asynchronous exception handling. 
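The edits below add the strictfp attribute (#0) to the constrained-intrinsic calls and to the functions containing them. As a rough C-level analogue, and assuming a compiler that honors the pragma, this corresponds to code built with FP-environment access enabled; the function name here is illustrative only.

  /* Sketch only: with FENV_ACCESS enabled, a supporting compiler lowers this
     addition through llvm.experimental.constrained.fadd and marks the caller
     strictfp, matching the attribute added in the test below. */
  #include <fenv.h>
  #pragma STDC FENV_ACCESS ON

  double add_strict(double a, double b) {
    return a + b; /* must respect the dynamic rounding mode and FP exceptions */
  }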
-define double @f1(double %a, double %b) { +define double @f1(double %a, double %b) #0 { entry: %fadd = call double @llvm.experimental.constrained.fadd.f64( double %a, double %b, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %fadd } -define double @f1u(double %a) { +define double @f1u(double %a) #0 { entry: %fsqrt = call double @llvm.experimental.constrained.sqrt.f64( double %a, metadata !"round.dynamic", - metadata !"fpexcept.strict") + metadata !"fpexcept.strict") #0 ret double %fsqrt } ; Test an illegal value for the rounding mode argument. ; CHECK2: invalid rounding mode argument -;T2: define double @f2(double %a, double %b) { +;T2: define double @f2(double %a, double %b) #0 { ;T2: entry: ;T2: %fadd = call double @llvm.experimental.constrained.fadd.f64( ;T2: double %a, double %b, ;T2: metadata !"round.dynomite", -;T2: metadata !"fpexcept.strict") +;T2: metadata !"fpexcept.strict") #0 ;T2: ret double %fadd ;T2: } ; Test an illegal value for the exception behavior argument. ; CHECK3: invalid exception behavior argument -;T3: define double @f3(double %a, double %b) { +;T3: define double @f3(double %a, double %b) #0 { ;T3: entry: ;T3: %fadd = call double @llvm.experimental.constrained.fadd.f64( ;T3: double %a, double %b, ;T3: metadata !"round.dynamic", -;T3: metadata !"fpexcept.restrict") +;T3: metadata !"fpexcept.restrict") #0 ;T3: ret double %fadd ;T3: } ; Test an illegal value for the rounding mode argument. ; CHECK4: invalid rounding mode argument -;T4: define double @f4(double %a) { +;T4: define double @f4(double %a) #0 { ;T4: entry: ;T4: %fadd = call double @llvm.experimental.constrained.sqrt.f64( ;T4: double %a, ;T4: metadata !"round.dynomite", -;T4: metadata !"fpexcept.strict") +;T4: metadata !"fpexcept.strict") #0 ;T4: ret double %fadd ;T4: } ; Test an illegal value for the exception behavior argument. ; CHECK5: invalid exception behavior argument -;T5: define double @f5(double %a) { +;T5: define double @f5(double %a) #0 { ;T5: entry: ;T5: %fadd = call double @llvm.experimental.constrained.sqrt.f64( ;T5: double %a, ;T5: metadata !"round.dynamic", -;T5: metadata !"fpexcept.restrict") +;T5: metadata !"fpexcept.restrict") #0 ;T5: ret double %fadd ;T5: } + +attributes #0 = { strictfp } diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 6771b9e7fee8c5..4a2181397ced15 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -124,6 +124,11 @@ def get_asan_rtlib(): opt_viewer_cmd = '%s %s/tools/opt-viewer/opt-viewer.py' % (sys.executable, config.llvm_src_root) +llvm_locstats_tool = os.path.join(config.llvm_tools_dir, 'llvm-locstats') +config.substitutions.append( + ('%llvm-locstats', "'%s' %s" % (config.python_executable, llvm_locstats_tool))) +config.llvm_locstats_used = os.path.exists(llvm_locstats_tool) + tools = [ ToolSubst('%lli', FindTool('lli'), post='.', extra_args=lli_args), ToolSubst('%llc_dwarf', FindTool('llc'), extra_args=llc_args), diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index 60a1a0a2d10f9c..7e9223c9401994 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -1,21 +1,19 @@ RUN: dsymutil -help 2>&1 | FileCheck --check-prefix=HELP %s HELP: OVERVIEW: manipulate archived DWARF debug symbol files. 
-HELP: USAGE: dsymutil{{[^ ]*}} [options] +HELP: USAGE: {{.*}}dsymutil{{[^ ]*}} [options] HELP-NOT: -reverse-iterate -HELP: Color Options -HELP: -color -HELP: Specific Options: +HELP: Dsymutil Options: HELP: -accelerator -HELP: -arch= +HELP: -arch HELP: -dump-debug-map HELP: -flat HELP: -minimize HELP: -no-odr HELP: -no-output HELP: -no-swiftmodule-timestamp -HELP: -num-threads= -HELP: -o= -HELP: -oso-prepend-path= +HELP: -num-threads +HELP: -oso-prepend-path +HELP: -o HELP: -papertrail HELP: -symbol-map HELP: -symtab diff --git a/llvm/test/tools/llvm-ar/case-detection.test b/llvm/test/tools/llvm-ar/case-detection.test index 998ce67b0a79fc..f49b8ad96e37ac 100644 --- a/llvm/test/tools/llvm-ar/case-detection.test +++ b/llvm/test/tools/llvm-ar/case-detection.test @@ -1,6 +1,6 @@ -- Test CamelCase tool name to ensure detection works properly -REQUIRES: target-windows, system-windows +REQUIRES: system-windows RUN: yaml2obj %S/Inputs/coff.yaml -o %t.obj RUN: rm -rf %t1 RUN: mkdir %t1 diff --git a/llvm/test/tools/llvm-ar/mri-utf8.test b/llvm/test/tools/llvm-ar/mri-utf8.test index 6499996007473d..e297dce8d8aefe 100644 --- a/llvm/test/tools/llvm-ar/mri-utf8.test +++ b/llvm/test/tools/llvm-ar/mri-utf8.test @@ -16,4 +16,8 @@ RUN: cd %t/extracted && llvm-ar x %t/mri.ar # include arguments with non-ascii characters. # Python on Linux defaults to ASCII encoding unless the # environment specifies otherwise, so it is explicitly set. +# The reliance the test has on this locale is not ideal, +# however alternate solutions have been difficult due to +# behaviour differences with python 2 vs python 3, +# and linux vs windows. RUN: env LANG=en_US.UTF-8 %python -c "assert open(u'\U000000A3.txt', 'rb').read() == b'contents\n'" diff --git a/llvm/test/tools/llvm-dwarfdump/X86/locstats.ll b/llvm/test/tools/llvm-dwarfdump/X86/locstats.ll index e88667390ea7b8..a295ad8e402e71 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/locstats.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/locstats.ll @@ -71,7 +71,7 @@ ; CHECK: "vars with 80-89% of its scope covered":1 ; CHECK: "vars with 90-99% of its scope covered":0 ; CHECK: "vars with 100% of its scope covered":1 -; CHECK: "vars (excluding the debug entry values) with 0% of its scope covered":0 +; CHECK: "vars (excluding the debug entry values) with 0% of its scope covered":1 ; CHECK: "vars (excluding the debug entry values) with 1-9% of its scope covered":0 ; CHECK: "vars (excluding the debug entry values) with 10-19% of its scope covered":0 ; CHECK: "vars (excluding the debug entry values) with 20-29% of its scope covered":0 @@ -80,9 +80,9 @@ ; CHECK: "vars (excluding the debug entry values) with 50-59% of its scope covered":1 ; CHECK: "vars (excluding the debug entry values) with 60-69% of its scope covered":0 ; CHECK: "vars (excluding the debug entry values) with 70-79% of its scope covered":0 -; CHECK: "vars (excluding the debug entry values) with 80-89% of its scope covered":0 +; CHECK: "vars (excluding the debug entry values) with 80-89% of its scope covered":1 ; CHECK: "vars (excluding the debug entry values) with 90-99% of its scope covered":0 -; CHECK: "vars (excluding the debug entry values) with 100% of its scope covered":1} +; CHECK: "vars (excluding the debug entry values) with 100% of its scope covered":1 ; ; The source code of the test case: ; extern void fn3(int *); diff --git a/llvm/test/tools/llvm-lib/duplicate.test b/llvm/test/tools/llvm-lib/duplicate.test new file mode 100644 index 00000000000000..3c503ca7f58f25 --- /dev/null +++ 
b/llvm/test/tools/llvm-lib/duplicate.test @@ -0,0 +1,14 @@ +If the same file is specified more than once as an input file, +llvm-lib should ignore all but the first occurrence of the file. + +RUN: rm -rf %t +RUN: mkdir -p %t + +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/foo.o %S/Inputs/a.s +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/bar.o %S/Inputs/b.s +RUN: llvm-lib -out:%t/foo.lib %t/foo.o %t/foo.o %t/bar.o + +RUN: llvm-ar t %t/foo.lib | FileCheck %s +CHECK: foo.o +CHECK-NOT: foo.o +CHECK: bar.o diff --git a/llvm/test/tools/llvm-lib/invalid.test b/llvm/test/tools/llvm-lib/invalid.test index 2978177a431e22..57266400cdc871 100644 --- a/llvm/test/tools/llvm-lib/invalid.test +++ b/llvm/test/tools/llvm-lib/invalid.test @@ -1,2 +1,2 @@ RUN: not llvm-lib %S/Inputs/cl-gl.obj 2>&1 | FileCheck %s -CHECK: not a COFF object, bitcode or resource file +CHECK: not a COFF object, bitcode, archive or resource file diff --git a/llvm/test/tools/llvm-lib/nest.test b/llvm/test/tools/llvm-lib/nest.test new file mode 100644 index 00000000000000..627c847b133397 --- /dev/null +++ b/llvm/test/tools/llvm-lib/nest.test @@ -0,0 +1,15 @@ +If an archive file is specified as an input file, its members +are added to an output file. This test verifies that beahvior. + +RUN: rm -rf %t +RUN: mkdir -p %t + +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/foo.o %S/Inputs/a.s +RUN: llvm-lib -out:%t/foo.lib %t/foo.o + +RUN: llvm-mc -triple=x86_64-pc-windows-msvc -filetype=obj -o %t/bar.o %S/Inputs/b.s +RUN: llvm-lib -out:%t/bar.lib %t/foo.lib %t/bar.o + +RUN: llvm-ar t %t/bar.lib | FileCheck %s +CHECK: foo.o +CHECK: bar.o diff --git a/llvm/test/tools/llvm-locstats/lit.local.cfg b/llvm/test/tools/llvm-locstats/lit.local.cfg new file mode 100644 index 00000000000000..0f2d477e161bb7 --- /dev/null +++ b/llvm/test/tools/llvm-locstats/lit.local.cfg @@ -0,0 +1,2 @@ +if not config.llvm_locstats_used: + config.unsupported = True diff --git a/llvm/test/tools/llvm-locstats/locstats.ll b/llvm/test/tools/llvm-locstats/locstats.ll new file mode 100644 index 00000000000000..394d5129df7ddd --- /dev/null +++ b/llvm/test/tools/llvm-locstats/locstats.ll @@ -0,0 +1,175 @@ +; UNSUPPORTED: system-windows +; REQUIRES: x86-registered-target +; RUN: llc %s -o %t0.o -filetype=obj +; RUN: %llvm-locstats %t0.o | FileCheck %s --check-prefix=LOCSTATS +; +; Test the llvm-locstats output. 
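; (Aside: the LOCSTATS counts below sum to nine variables, so each percentage appears to be count/9 -- 1/9 is ~11% and 2/9 is ~22%.)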
+; LOCSTATS: 0% 0 0% +; LOCSTATS: 1-9% 0 0% +; LOCSTATS: 10-19% 0 0% +; LOCSTATS: 20-29% 1 11% +; LOCSTATS: 30-39% 0 0% +; LOCSTATS: 40-49% 1 11% +; LOCSTATS: 50-59% 1 11% +; LOCSTATS: 60-69% 1 11% +; LOCSTATS: 70-79% 0 0% +; LOCSTATS: 80-89% 2 22% +; LOCSTATS: 90-99% 1 11% +; LOCSTATS: 100% 2 22% +; +; The source code of the test case: +;extern int fn2 (int); +; +;__attribute__((noinline)) +;int +;fn1 (int *x, int *y) +;{ +; int a = *x; +; int b = *y; +; int local = a + b; +; if (a > 1) { +; local += 2; +; ++local; +; if (local > 200) +; local -= fn2(a); +; } else { +; local += 3; +; ++local; +; local += fn2(a); +; } +; if (b > 4) +; local += a; +; int local2 = 7; +; local -= fn2 (local2); +; return local; +;} +; +;__attribute__((noinline)) +;int f() +;{ +; int l, k; +; int res = 0; +; res += fn1 (&l, &k); +; return res; +;} +; +; ModuleID = 'locstats.c' +source_filename = "locstats.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @fn1(i32* nocapture readonly %0, i32* nocapture readonly %1) local_unnamed_addr !dbg !7 { + call void @llvm.dbg.value(metadata i32* %0, metadata !13, metadata !DIExpression()), !dbg !19 + call void @llvm.dbg.value(metadata i32* %1, metadata !14, metadata !DIExpression()), !dbg !19 + %3 = load i32, i32* %0, align 4, !dbg !20 + call void @llvm.dbg.value(metadata i32 %3, metadata !15, metadata !DIExpression()), !dbg !19 + %4 = load i32, i32* %1, align 4, !dbg !20 + call void @llvm.dbg.value(metadata i32 %4, metadata !16, metadata !DIExpression()), !dbg !19 + %5 = add nsw i32 %4, %3, !dbg !20 + call void @llvm.dbg.value(metadata i32 %5, metadata !17, metadata !DIExpression()), !dbg !19 + %6 = icmp sgt i32 %3, 1, !dbg !20 + br i1 %6, label %7, label %13, !dbg !22 + +7: ; preds = %2 + call void @llvm.dbg.value(metadata i32 %5, metadata !17, metadata !DIExpression(DW_OP_plus_uconst, 2, DW_OP_stack_value)), !dbg !19 + %8 = add nsw i32 %5, 3, !dbg !23 + call void @llvm.dbg.value(metadata i32 %8, metadata !17, metadata !DIExpression()), !dbg !19 + %9 = icmp sgt i32 %8, 200, !dbg !25 + br i1 %9, label %10, label %17, !dbg !27 + +10: ; preds = %7 + %11 = tail call i32 @fn2(i32 %3), !dbg !27 + %12 = sub nsw i32 %8, %11, !dbg !27 + call void @llvm.dbg.value(metadata i32 %12, metadata !17, metadata !DIExpression()), !dbg !19 + br label %17, !dbg !27 + +13: ; preds = %2 + call void @llvm.dbg.value(metadata i32 %5, metadata !17, metadata !DIExpression(DW_OP_plus_uconst, 3, DW_OP_stack_value)), !dbg !19 + %14 = add nsw i32 %5, 4, !dbg !28 + call void @llvm.dbg.value(metadata i32 %14, metadata !17, metadata !DIExpression()), !dbg !19 + %15 = tail call i32 @fn2(i32 %3), !dbg !30 + %16 = add nsw i32 %14, %15, !dbg !30 + call void @llvm.dbg.value(metadata i32 %16, metadata !17, metadata !DIExpression()), !dbg !19 + br label %17 + +17: ; preds = %7, %10, %13 + %18 = phi i32 [ %12, %10 ], [ %8, %7 ], [ %16, %13 ], !dbg !31 + call void @llvm.dbg.value(metadata i32 %18, metadata !17, metadata !DIExpression()), !dbg !19 + %19 = icmp sgt i32 %4, 4, !dbg !32 + %20 = select i1 %19, i32 %3, i32 0, !dbg !34 + %21 = add nsw i32 %18, %20, !dbg !34 + call void @llvm.dbg.value(metadata i32 %21, metadata !17, metadata !DIExpression()), !dbg !19 + call void @llvm.dbg.value(metadata i32 7, metadata !18, metadata !DIExpression()), !dbg !19 + %22 = tail call i32 @fn2(i32 7), !dbg !34 + %23 = sub i32 %21, %22, !dbg !34 + call void @llvm.dbg.value(metadata i32 %23, metadata !17, 
metadata !DIExpression()), !dbg !19 + ret i32 %23, !dbg !34 +} + +declare dso_local i32 @fn2(i32) local_unnamed_addr + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @f() local_unnamed_addr !dbg !35 { + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = bitcast i32* %1 to i8*, !dbg !42 + %4 = bitcast i32* %2 to i8*, !dbg !42 + call void @llvm.dbg.value(metadata i32 0, metadata !41, metadata !DIExpression()), !dbg !42 + call void @llvm.dbg.value(metadata i32* %1, metadata !39, metadata !DIExpression(DW_OP_deref)), !dbg !42 + call void @llvm.dbg.value(metadata i32* %2, metadata !40, metadata !DIExpression(DW_OP_deref)), !dbg !42 + %5 = call i32 @fn1(i32* nonnull %1, i32* nonnull %2), !dbg !42 + call void @llvm.dbg.value(metadata i32 %5, metadata !41, metadata !DIExpression()), !dbg !42 + ret i32 %5, !dbg !42 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "locstats.c", directory: "/dir") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 10.0.0"} +!7 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 5, type: !8, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !11, !11} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64) +!12 = !{!13, !14, !15, !16, !17, !18} +!13 = !DILocalVariable(name: "x", arg: 1, scope: !7, file: !1, line: 5, type: !11) +!14 = !DILocalVariable(name: "y", arg: 2, scope: !7, file: !1, line: 5, type: !11) +!15 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 7, type: !10) +!16 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 8, type: !10) +!17 = !DILocalVariable(name: "local", scope: !7, file: !1, line: 9, type: !10) +!18 = !DILocalVariable(name: "local2", scope: !7, file: !1, line: 22, type: !10) +!19 = !DILocation(line: 0, scope: !7) +!20 = !DILocation(line: 7, column: 11, scope: !7) +!21 = distinct !DILexicalBlock(scope: !7, file: !1, line: 10, column: 7) +!22 = !DILocation(line: 10, column: 7, scope: !7) +!23 = !DILocation(line: 12, column: 5, scope: !24) +!24 = distinct !DILexicalBlock(scope: !21, file: !1, line: 10, column: 14) +!25 = !DILocation(line: 13, column: 15, scope: !26) +!26 = distinct !DILexicalBlock(scope: !24, file: !1, line: 13, column: 9) +!27 = !DILocation(line: 13, column: 9, scope: !24) +!28 = !DILocation(line: 17, column: 5, scope: !26) +!29 = distinct !DILexicalBlock(scope: !21, file: !1, line: 15, column: 10) +!30 = !DILocation(line: 18, column: 14, scope: !29) +!31 = !DILocation(line: 0, scope: !21) +!32 = !DILocation(line: 20, column: 9, scope: !33) +!33 = distinct !DILexicalBlock(scope: !7, file: !1, line: 20, column: 7) +!34 = !DILocation(line: 20, column: 7, scope: !7) +!35 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 28, type: !36, scopeLine: 29, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !38) +!36 = !DISubroutineType(types: !37) +!37 = !{!10} +!38 = !{!39, 
!40, !41} +!39 = !DILocalVariable(name: "l", scope: !35, file: !1, line: 30, type: !10) +!40 = !DILocalVariable(name: "k", scope: !35, file: !1, line: 30, type: !10) +!41 = !DILocalVariable(name: "res", scope: !35, file: !1, line: 31, type: !10) +!42 = !DILocation(line: 30, column: 3, scope: !35) diff --git a/llvm/test/tools/llvm-objcopy/ELF/binary-input.test b/llvm/test/tools/llvm-objcopy/ELF/binary-input.test index 8eef7f772f9cea..f232296ded8259 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/binary-input.test +++ b/llvm/test/tools/llvm-objcopy/ELF/binary-input.test @@ -110,3 +110,11 @@ # CHECK-NEXT: Section: Absolute # CHECK-NEXT: } # CHECK-NEXT: ] + +## The alignment can be changed by --set-section-alignment. +# RUN: llvm-objcopy -I binary -O elf64-x86-64 --set-section-alignment .data=8 %t.x-txt %t2.o +# RUN: llvm-readobj --sections %t2.o | FileCheck --check-prefix=ALIGN %s + +# ALIGN: Name: .data +# ALIGN: AddressAlignment: +# ALIGN-SAME: 8{{$}} diff --git a/llvm/test/tools/llvm-objcopy/ELF/set-section-alignment.test b/llvm/test/tools/llvm-objcopy/ELF/set-section-alignment.test new file mode 100644 index 00000000000000..79c7eddffddc31 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/ELF/set-section-alignment.test @@ -0,0 +1,54 @@ +# RUN: yaml2obj %s -o %t + +# RUN: llvm-objcopy --set-section-alignment .foo=4 --set-section-alignment .bar=0x5 \ +# RUN: --set-section-alignment .baz=0 %t %t.2 +# RUN: llvm-readobj --sections %t.2 | FileCheck --check-prefix=CHECK %s + +# CHECK: Name: .foo +# CHECK: AddressAlignment: +# CHECK-SAME: 4{{$}} +# CHECK: Name: .bar +# CHECK: AddressAlignment: +# CHECK-SAME: 5{{$}} +# CHECK: Name: .baz +# CHECK: AddressAlignment: +# CHECK-SAME: 0{{$}} + +## If a section is specified multiple times, the last wins. +# RUN: llvm-objcopy --set-section-alignment .foo=4 --set-section-alignment=.foo=7 %t %t.3 +# RUN: llvm-readobj --sections %t.3 | FileCheck --check-prefix=MULTI %s + +# MULTI: Name: .foo +# MULTI: AddressAlignment: +# MULTI-SAME: 7{{$}} + +## Ignore the option if the section does not exist. 
+# RUN: llvm-objcopy --set-section-alignment .not_exist=4 %t.3 %t.4 +# RUN: cmp %t.3 %t.4 + +# RUN: not llvm-objcopy --set-section-alignment=.foo %t /dev/null 2>&1 | \ +# RUN: FileCheck --check-prefix=MISSING-EQUAL %s +# MISSING-EQUAL: error: bad format for --set-section-alignment: missing '=' + +# RUN: not llvm-objcopy --set-section-alignment==4 %t /dev/null 2>&1 | \ +# RUN: FileCheck --check-prefix=MISSING-SECTION %s +# MISSING-SECTION: error: bad format for --set-section-alignment: missing section name + +# RUN: not llvm-objcopy --set-section-alignment=.foo=bar %t /dev/null 2>&1 | \ +# RUN: FileCheck --check-prefix=INVALID-ALIGN %s +# INVALID-ALIGN: error: invalid alignment for --set-section-alignment: 'bar' + +!ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .foo + Type: SHT_PROGBITS + - Name: .bar + Type: SHT_NOBITS + - Name: .baz + Type: SHT_NOTE + AddressAlign: 4 diff --git a/llvm/test/tools/llvm-objdump/X86/adjust-vma.test b/llvm/test/tools/llvm-objdump/X86/adjust-vma.test index f28ff36a190b99..61eadaffa7e0c0 100644 --- a/llvm/test/tools/llvm-objdump/X86/adjust-vma.test +++ b/llvm/test/tools/llvm-objdump/X86/adjust-vma.test @@ -3,6 +3,35 @@ # RUN: llvm-objdump --all-headers -D -z --adjust-vma=0x0 %t | FileCheck %s --check-prefixes=COMMON,NOADJUST # RUN: llvm-objdump --all-headers -D -z --adjust-vma=0x123000 %t | FileCheck %s --check-prefixes=COMMON,ADJUST +# NOADJUST: Sections: +# NOADJUST-NEXT: Idx Name Size VMA Type +# NOADJUST-NEXT: 0 00000000 0000000000000000 +# NOADJUST-NEXT: 1 .text 00000002 0000000000000000 TEXT +# NOADJUST-NEXT: 2 .debug_str 00000004 0000000000000000 +# NOADJUST-NEXT: 3 .rela.debug_str 00000018 0000000000000000 +# NOADJUST-NEXT: 4 .data 00000004 0000000000000000 DATA +# NOADJUST-NEXT: 5 .rela.data 00000018 0000000000000000 +# NOADJUST-NEXT: 6 .symtab 00000060 0000000000000000 +# NOADJUST-NEXT: 7 .strtab 00000010 0000000000000000 +# NOADJUST-NEXT: 8 .shstrtab 0000003c 0000000000000000 + +# ADJUST: Sections: +# ADJUST-NEXT: Idx Name Size VMA Type +# ADJUST-NEXT: 0 00000000 0000000000000000 +# ADJUST-NEXT: 1 .text 00000002 0000000000123000 TEXT +# ADJUST-NEXT: 2 .debug_str 00000004 0000000000000000 +# ADJUST-NEXT: 3 .rela.debug_str 00000018 0000000000000000 +# ADJUST-NEXT: 4 .data 00000004 0000000000123000 DATA +# ADJUST-NEXT: 5 .rela.data 00000018 0000000000000000 +# ADJUST-NEXT: 6 .symtab 00000060 0000000000000000 +# ADJUST-NEXT: 7 .strtab 00000010 0000000000000000 +# ADJUST-NEXT: 8 .shstrtab 0000003c 0000000000000000 + +# COMMON: SYMBOL TABLE: +# COMMON-NEXT: 0000000000000001 l F .text 00000000 func +# COMMON-NEXT: 0000000000000000 .text 00000000 sym +# COMMON-NEXT: 0000000000000000 l d .text 00000000 .text + # NOADJUST: 0000000000000000 sym: # NOADJUST-NEXT: 0: {{.*}} nop # NOADJUST: 0000000000000001 func: @@ -41,35 +70,6 @@ # COMMON-NEXT: 0: {{.*}} addb %al, (%rax) ## ... There are more lines here. We do not care. 
-# NOADJUST: Sections: -# NOADJUST-NEXT: Idx Name Size VMA Type -# NOADJUST-NEXT: 0 00000000 0000000000000000 -# NOADJUST-NEXT: 1 .text 00000002 0000000000000000 TEXT -# NOADJUST-NEXT: 2 .debug_str 00000004 0000000000000000 -# NOADJUST-NEXT: 3 .rela.debug_str 00000018 0000000000000000 -# NOADJUST-NEXT: 4 .data 00000004 0000000000000000 DATA -# NOADJUST-NEXT: 5 .rela.data 00000018 0000000000000000 -# NOADJUST-NEXT: 6 .symtab 00000060 0000000000000000 -# NOADJUST-NEXT: 7 .strtab 00000010 0000000000000000 -# NOADJUST-NEXT: 8 .shstrtab 0000003c 0000000000000000 - -# ADJUST: Sections: -# ADJUST-NEXT: Idx Name Size VMA Type -# ADJUST-NEXT: 0 00000000 0000000000000000 -# ADJUST-NEXT: 1 .text 00000002 0000000000123000 TEXT -# ADJUST-NEXT: 2 .debug_str 00000004 0000000000000000 -# ADJUST-NEXT: 3 .rela.debug_str 00000018 0000000000000000 -# ADJUST-NEXT: 4 .data 00000004 0000000000123000 DATA -# ADJUST-NEXT: 5 .rela.data 00000018 0000000000000000 -# ADJUST-NEXT: 6 .symtab 00000060 0000000000000000 -# ADJUST-NEXT: 7 .strtab 00000010 0000000000000000 -# ADJUST-NEXT: 8 .shstrtab 0000003c 0000000000000000 - -# COMMON: SYMBOL TABLE: -# COMMON-NEXT: 0000000000000001 l F .text 00000000 func -# COMMON-NEXT: 0000000000000000 .text 00000000 sym -# COMMON-NEXT: 0000000000000000 l d .text 00000000 .text - --- !ELF FileHeader: Class: ELFCLASS64 diff --git a/llvm/test/tools/llvm-objdump/X86/demangle.s b/llvm/test/tools/llvm-objdump/X86/demangle.s index 76aacb68995d2e..5bfdbf73e926e5 100644 --- a/llvm/test/tools/llvm-objdump/X86/demangle.s +++ b/llvm/test/tools/llvm-objdump/X86/demangle.s @@ -1,13 +1,13 @@ # RUN: llvm-mc %s -filetype=obj -triple=x86_64-pc-linux -o %t # RUN: llvm-objdump -t -r --demangle %t | FileCheck %s -## Check we demangle symbols when printing relocations. -# CHECK: 000000000000001 R_X86_64_PLT32 foo()-4 - ## Check we demangle symbols when printing symbol table. # CHECK: SYMBOL TABLE: # CHECK-NEXT: 0000000000000000 g F .text 00000000 foo() +## Check we demangle symbols when printing relocations. +# CHECK: 000000000000001 R_X86_64_PLT32 foo()-4 + ## Check the case when relocations are inlined into disassembly. 
# RUN: llvm-objdump -d -r --demangle %t | FileCheck %s --check-prefix=INLINE # INLINE: foo(): diff --git a/llvm/test/tools/llvm-objdump/X86/out-of-section-sym.test b/llvm/test/tools/llvm-objdump/X86/out-of-section-sym.test index 4da81c63ca3de0..d79a05550f0473 100644 --- a/llvm/test/tools/llvm-objdump/X86/out-of-section-sym.test +++ b/llvm/test/tools/llvm-objdump/X86/out-of-section-sym.test @@ -5,12 +5,13 @@ // RUN: cmp %t0 %t1 // RUN: FileCheck --input-file %t0 %s -CHECK: Disassembly of section .text: -CHECK-EMPTY: -CHECK-NEXT: _start: -CHECK-NEXT: 10: c3 retl -CHECK-NEXT: SYMBOL TABLE: +CHECK: SYMBOL TABLE: CHECK-NEXT: 00000010 l d .text 00000000 .text CHECK-NEXT: 00000010 .text 00000000 _start CHECK-NEXT: 00000020 .text 00000000 _fdata CHECK-NEXT: 00000010 .text 00000000 _ftext +CHECK-EMPTY: +CHECK-NEXT: Disassembly of section .text: +CHECK-EMPTY: +CHECK-NEXT: _start: +CHECK-NEXT: 10: c3 retl diff --git a/llvm/test/tools/llvm-objdump/X86/output-ordering.test b/llvm/test/tools/llvm-objdump/X86/output-ordering.test new file mode 100644 index 00000000000000..0d629af91e007d --- /dev/null +++ b/llvm/test/tools/llvm-objdump/X86/output-ordering.test @@ -0,0 +1,70 @@ +# RUN: yaml2obj %s > %t.o +# RUN: llvm-objdump --file-headers --private-headers --section-headers --syms \ +# RUN: --full-contents --dwarf=frames \ +# RUN: --reloc %t.o | FileCheck %s --check-prefixes=CHECK,RELOC +# RUN: llvm-objdump --file-headers --private-headers --section-headers --syms \ +# RUN: --full-contents --dwarf=frames \ +# RUN: --disassemble %t.o | FileCheck %s --check-prefixes=CHECK,DISASM + +## Test the ordering of most of the output. Note that --disassemble suppresses +## --reloc, so we test them independently. + +## File headers (-f) +# CHECK: file format ELF64-x86-64 +# CHECK: architecture: x86_64 +# CHECK: start address: 0x0000000000000000 +## Private headers (-p) +# CHECK: Program Header: +# CHECK: Dynamic Section: +## Section headers (-h) +# CHECK: Sections: +## Symbol table (-t) +# CHECK: SYMBOL TABLE: +## DWARF contents: +# CHECK: .eh_frame contents: +## Relocations (-r) +# RELOC: RELOCATION RECORDS FOR [.text]: +## Section contents (-s) +# CHECK: Contents of section .rel.text: +# CHECK: Contents of section .dynamic: +# CHECK: Contents of section .symtab: +# CHECK: Contents of section .strtab: +# CHECK: Contents of section .shstrtab: +## Disassembly (-d) +# DISASM: Disassembly of section .text: + +!ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Content: e800000000e800000000 + - Name: .rel.text + Type: SHT_REL + Info: .text + Relocations: + - Offset: 0x1 + Symbol: foo + Type: R_X86_64_32 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + EntSize: 0x0000000000000010 + Entries: + - Tag: DT_INIT + Value: 0x00000000000006A0 + - Tag: DT_NULL + Value: 0x0000000000000000 + - Name: .eh_frame + Type: SHT_X86_64_UNWIND + Flags: [ SHF_ALLOC ] + AddressAlign: 0x0000000000000001 + Content: 00000000 +Symbols: + - Name: foo + Section: .text diff --git a/llvm/test/tools/llvm-objdump/all-headers.test b/llvm/test/tools/llvm-objdump/all-headers.test index ea45eccee35657..7e57eba3b016d2 100644 --- a/llvm/test/tools/llvm-objdump/all-headers.test +++ b/llvm/test/tools/llvm-objdump/all-headers.test @@ -1,8 +1,9 @@ # RUN: yaml2obj %s > %t # RUN: llvm-objdump --all-headers %t | FileCheck %s -## Check we print file format, architecture and start address followed by the -## other 
data when using --all-headers. +## Note: --all-headers (-x) is an alias for --archive-headers --file-headers +## --section-headers --private-headers --reloc --syms (-afhprt). Test for them +## in the following order which provides GNU objdump compatability. # CHECK: file format ELF64-x86-64 # CHECK-EMPTY: @@ -13,6 +14,7 @@ # CHECK: Dynamic Section: # CHECK: Sections: # CHECK: SYMBOL TABLE: +# CHECK: RELOCATION RECORDS FOR [.text]: ## Check how we dump the archives. ## Check we dump the appropriate headers for each file in the archive. @@ -23,26 +25,49 @@ # RUN: llvm-objdump --all-headers %t.a | FileCheck %s --check-prefix=ARCHIVE # ARCHIVE: {{.*}}.a({{.*}}): file format ELF64-x86-64 +# ARCHIVE: rw-r--r-- # ARCHIVE: architecture: x86_64 # ARCHIVE: start address: 0x0000000000000000 -# ARCHIVE: rw-r--r-- # ARCHIVE: Program Header: # ARCHIVE: Dynamic Section: # ARCHIVE: Sections: # ARCHIVE: SYMBOL TABLE: +# ARCHIVE: RELOCATION RECORDS FOR [.text]: # ARCHIVE: {{.*}}.a({{.*}}2): file format ELF64-x86-64 +# ARCHIVE: rw-r--r-- # ARCHIVE: architecture: x86_64 # ARCHIVE: start address: 0x0000000000000000 -# ARCHIVE: rw-r--r-- # ARCHIVE: Program Header: # ARCHIVE: Dynamic Section: # ARCHIVE: Sections: # ARCHIVE: SYMBOL TABLE: +# ARCHIVE: RELOCATION RECORDS FOR [.text]: !ELF FileHeader: Class: ELFCLASS64 Data: ELFDATA2LSB - Type: ET_EXEC + Type: ET_REL Machine: EM_X86_64 Sections: + - Name: .text + Type: SHT_PROGBITS + - Name: .rel.text + Type: SHT_REL + Info: .text + Relocations: + - Offset: 0x1 + Symbol: foo + Type: R_X86_64_32 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + EntSize: 0x0000000000000010 + Entries: + - Tag: DT_INIT + Value: 0x00000000000006A0 + - Tag: DT_NULL + Value: 0x0000000000000000 +Symbols: + - Name: foo + Section: .text diff --git a/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test b/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test new file mode 100644 index 00000000000000..66b0543d7a416e --- /dev/null +++ b/llvm/test/tools/llvm-profdata/profile-symbol-list-compress.test @@ -0,0 +1,6 @@ +REQUIRES: zlib +; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections -prof-sym-list=%S/Inputs/profile-symbol-list-1.text %S/Inputs/sample-profile.proftext -o %t.1.output +; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections -prof-sym-list=%S/Inputs/profile-symbol-list-2.text %S/Inputs/sample-profile.proftext -o %t.2.output +; RUN: llvm-profdata merge -sample -extbinary -compress-all-sections %t.1.output %t.2.output -o %t.3.output +; RUN: llvm-profdata show -sample -show-prof-sym-list %t.3.output > %t.4.output +; RUN: diff %S/Inputs/profile-symbol-list.expected %t.4.output diff --git a/llvm/test/tools/llvm-profdata/roundtrip-compress.test b/llvm/test/tools/llvm-profdata/roundtrip-compress.test new file mode 100644 index 00000000000000..7e495b6d95128c --- /dev/null +++ b/llvm/test/tools/llvm-profdata/roundtrip-compress.test @@ -0,0 +1,10 @@ +REQUIRES: zlib +# Round trip from text --> compressed extbinary --> text +RUN: llvm-profdata merge --sample --extbinary -compress-all-sections -output=%t.1.profdata %S/Inputs/sample-profile.proftext +RUN: llvm-profdata merge --sample --text -output=%t.1.proftext %t.1.profdata +RUN: diff %t.1.proftext %S/Inputs/sample-profile.proftext +# Round trip from text --> binary --> compressed extbinary --> text +RUN: llvm-profdata merge --sample --binary -output=%t.2.profdata %S/Inputs/sample-profile.proftext +RUN: llvm-profdata merge --sample --extbinary -compress-all-sections 
-output=%t.3.profdata %t.2.profdata +RUN: llvm-profdata merge --sample --text -output=%t.2.proftext %t.3.profdata +RUN: diff %t.2.proftext %S/Inputs/sample-profile.proftext diff --git a/llvm/test/tools/llvm-readobj/all.test b/llvm/test/tools/llvm-readobj/all.test index 17c5a007adfa57..ac27f38c3a3276 100644 --- a/llvm/test/tools/llvm-readobj/all.test +++ b/llvm/test/tools/llvm-readobj/all.test @@ -1,16 +1,26 @@ -RUN: llvm-readobj -a %p/Inputs/trivial.obj.elf-i386 \ -RUN: | FileCheck %s -check-prefix ALL -RUN: llvm-readobj --all %p/Inputs/trivial.obj.elf-i386 \ -RUN: | FileCheck %s -check-prefix ALL +# RUN: yaml2obj %s -o %t.o +# RUN: llvm-readobj -a %t.o | FileCheck %s --check-prefix ALL +# RUN: llvm-readobj --all %t.o | FileCheck %s --check-prefix ALL -ALL: Format: ELF32-i386 -ALL: Arch: i386 -ALL: AddressSize: 32bit -ALL: LoadName: -ALL: ElfHeader { -ALL: Sections [ -ALL: Relocations [ -ALL: Symbols [ -ALL: ProgramHeaders [ -ALL: Notes [ -ALL: StackSizes [ +# ALL: Format: ELF32-i386 +# ALL: Arch: i386 +# ALL: AddressSize: 32bit +# ALL: LoadName: +# ALL: ElfHeader { +# ALL: Sections [ +# ALL: Relocations [ +# ALL: Symbols [ +# ALL: ProgramHeaders [ +# ALL: Version symbols { +# ALL: SHT_GNU_verdef { +# ALL: SHT_GNU_verneed { +# ALL: Addrsig [ +# ALL: Notes [ +# ALL: StackSizes [ + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_386 diff --git a/llvm/test/tools/llvm-readobj/elf-addrsig.test b/llvm/test/tools/llvm-readobj/elf-addrsig.test new file mode 100644 index 00000000000000..a0c32ab593659c --- /dev/null +++ b/llvm/test/tools/llvm-readobj/elf-addrsig.test @@ -0,0 +1,84 @@ +## Show that llvm-readobj can dump SHT_LLVM_ADDRSIG sections. + +# RUN: yaml2obj --docnum=1 %s -o %t1.o +# RUN: llvm-readobj --addrsig %t1.o | FileCheck -DFILE=%t1.o %s --check-prefix LLVM +# RUN: not llvm-readelf --addrsig %t1.o 2>&1 | FileCheck -DFILE=%t1.o %s --check-prefix GNU + +# LLVM: Addrsig [ +# LLVM-NEXT: Sym: foo (1) +# LLVM-NEXT: Sym: bar (2) +# LLVM-NEXT: ] + +# GNU: error: '[[FILE]]': --addrsig: not implemented + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar +Symbols: + - Name: foo + - Name: bar + +## Check that llvm-readobj dumps any SHT_LLVM_ADDRSIG section when --all +## is specified for LLVM style, but not for GNU style. +## TODO: Refine the llvm-readelf check when GNU-style dumping is implemented. + +# RUN: llvm-readobj --all %t1.o | FileCheck %s --check-prefix LLVM +# RUN: llvm-readelf --all %t1.o 2>&1 | FileCheck %s --implicit-check-not=warning --implicit-check-not=error + +## Check we report a warning when SHT_LLVM_ADDRSIG is broken (e.g. contains a malformed uleb128). + +# RUN: yaml2obj --docnum=2 %s -o %t2.o +# RUN: llvm-readobj --addrsig %t2.o 2>&1 | FileCheck %s -DFILE=%t2.o --check-prefix=MALFORMED + +# MALFORMED: warning: '[[FILE]]': malformed uleb128, extends past end + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "FF" + +## Check we report a warning when SHT_LLVM_ADDRSIG references a symbol that can't be +## dumped (e.g. the index value is larger than the number of symbols in .symtab). 
+ +# RUN: yaml2obj --docnum=3 %s -o %t3.o +# RUN: llvm-readobj --addrsig %t3.o 2>&1 | FileCheck %s -DFILE=%t3.o --check-prefix=INVALID-INDEX + +# INVALID-INDEX: Addrsig [ +# INVALID-INDEX-NEXT: Sym: foo (1) +# INVALID-INDEX-EMPTY: +# INVALID-INDEX-NEXT: warning: '[[FILE]]': unable to get symbol from section [index 2]: invalid symbol index (255) +# INVALID-INDEX-NEXT: Sym: (255) +# INVALID-INDEX-NEXT: Sym: bar (2) +# INVALID-INDEX-NEXT: ] + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 1 + - Index: 255 + - Index: 2 +Symbols: + - Name: foo + - Name: bar diff --git a/llvm/test/tools/llvm-readobj/elf-hash-symbols.test b/llvm/test/tools/llvm-readobj/elf-hash-symbols.test index b0140a2e9c1cee..4ffecf9fcc6d4b 100644 --- a/llvm/test/tools/llvm-readobj/elf-hash-symbols.test +++ b/llvm/test/tools/llvm-readobj/elf-hash-symbols.test @@ -361,3 +361,43 @@ ProgramHeaders: PAddr: 0x1000 Sections: - Section: .dynamic + +## Show that we report a warning for a hash table which contains an entry of +## the bucket array pointing to a cycle. + +# RUN: yaml2obj --docnum=6 %s -o %t6.so +# RUN: llvm-readelf --hash-symbols %t6.so 2>&1 | FileCheck %s -DFILE=%t6.so --check-prefix=BROKEN + +# BROKEN: Symbol table of .hash for image: +# BROKEN-NEXT: Num Buc: Value Size Type Bind Vis Ndx Name +# BROKEN-NEXT: 1 0: 00000000 0 NOTYPE LOCAL DEFAULT UND aaa +# BROKEN: warning: '[[FILE]]': .hash section is invalid: bucket 1: a cycle was detected in the linked chain + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_386 +Sections: + - Name: .hash + Type: SHT_HASH + Link: .dynsym + Bucket: [ 1 ] + Chain: [ 1, 1 ] + - Name: .dynamic + Type: SHT_DYNAMIC + Entries: +## llvm-readelf will read the hash table from the file offset +## p_offset + (p_vaddr - DT_HASH) = p_offset + (0 - 0) = p_offset, +## which is the start of PT_LOAD, i.e. the file offset of .hash. 
+ - Tag: DT_HASH + Value: 0x0 +DynamicSymbols: + - Name: aaa + - Name: bbb +ProgramHeaders: + - Type: PT_LOAD + Sections: + - Section: .hash + - Section: .dynamic diff --git a/llvm/test/tools/llvm-readobj/elf-section-types.test b/llvm/test/tools/llvm-readobj/elf-section-types.test index aad9f43c8a3015..20b881249c7f57 100644 --- a/llvm/test/tools/llvm-readobj/elf-section-types.test +++ b/llvm/test/tools/llvm-readobj/elf-section-types.test @@ -196,6 +196,7 @@ Sections: Type: SHT_LLVM_CALL_GRAPH_PROFILE - Name: llvm_addrsig Type: SHT_LLVM_ADDRSIG + Symbols: - Name: .deplibs Type: SHT_LLVM_DEPENDENT_LIBRARIES - Name: .llvm_sympart.f diff --git a/llvm/test/tools/llvm-readobj/mips-abiflags.test b/llvm/test/tools/llvm-readobj/mips-abiflags.test index c06d147397ebc9..54797bfa76f62a 100644 --- a/llvm/test/tools/llvm-readobj/mips-abiflags.test +++ b/llvm/test/tools/llvm-readobj/mips-abiflags.test @@ -1,8 +1,13 @@ -RUN: llvm-readobj --mips-abi-flags %p/Inputs/abiflags.obj.elf-mipsel | \ +RUN: llvm-readobj -A %p/Inputs/abiflags.obj.elf-mipsel | \ RUN: FileCheck -check-prefix=EL64 %s -RUN: llvm-readobj --mips-abi-flags %p/Inputs/abiflags.obj.elf-mips | \ +RUN: llvm-readobj -A %p/Inputs/abiflags.obj.elf-mips | \ RUN: FileCheck -check-prefix=BE32 %s +RUN: llvm-readelf -A %p/Inputs/abiflags.obj.elf-mipsel | \ +RUN: FileCheck -check-prefix=GNU-EL64 %s +RUN: llvm-readelf -A %p/Inputs/abiflags.obj.elf-mips | \ +RUN: FileCheck -check-prefix=GNU-BE32 %s + EL64: MIPS ABI Flags { EL64-NEXT: Version: 0 EL64-NEXT: ISA: MIPS64r5 @@ -40,3 +45,27 @@ BE32-NEXT: ODDSPREG (0x1) BE32-NEXT: ] BE32-NEXT: Flags 2: 0x0 BE32-NEXT: } + +GNU-EL64: MIPS ABI Flags Version: 0 +GNU-EL64-EMPTY: +GNU-EL64-NEXT: ISA: MIPS64r5 +GNU-EL64-NEXT: GPR size: 64 +GNU-EL64-NEXT: CPR1 size: 64 +GNU-EL64-NEXT: CPR2 size: 0 +GNU-EL64-NEXT: FP ABI: Hard float (double precision) +GNU-EL64-NEXT: ISA Extension: Cavium Networks Octeon3 +GNU-EL64-NEXT: ASEs: DSP, DSPR2, VZ +GNU-EL64-NEXT: FLAGS 1: 00000001 +GNU-EL64-NEXT: FLAGS 2: 00000000 + +GNU-BE32: MIPS ABI Flags Version: 0 +GNU-BE32-EMPTY: +GNU-BE32-NEXT: ISA: MIPS32r2 +GNU-BE32-NEXT: GPR size: 32 +GNU-BE32-NEXT: CPR1 size: 0 +GNU-BE32-NEXT: CPR2 size: 0 +GNU-BE32-NEXT: FP ABI: Soft float +GNU-BE32-NEXT: ISA Extension: None +GNU-BE32-NEXT: ASEs: DSP, DSPR2, microMIPS +GNU-BE32-NEXT: FLAGS 1: 00000001 +GNU-BE32-NEXT: FLAGS 2: 00000000 diff --git a/llvm/test/tools/llvm-readobj/mips-got-overlapped.test b/llvm/test/tools/llvm-readobj/mips-got-overlapped.test index 85c4fe2d67c18f..881c63b79a4f93 100644 --- a/llvm/test/tools/llvm-readobj/mips-got-overlapped.test +++ b/llvm/test/tools/llvm-readobj/mips-got-overlapped.test @@ -1,9 +1,9 @@ -# Check that llvm-readobj --mips-plt-got correctly shows .got section +# Check that llvm-readobj -A correctly shows .got section # content if there are some other zero-sized sections with the same # address as the .got. got-over.exe.elf-mips has zero-sized .data # section at the same offset .got section. -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-over.exe.elf-mips | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/got-over.exe.elf-mips | FileCheck %s GOT-OBJ: Cannot find PLTGOT dynamic table tag. 
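The new elf-hash-symbols.test case above notes that llvm-readelf ends up reading the hash table straight from p_offset because both DT_HASH and the covering PT_LOAD virtual address are zero. As a rough illustration of the general virtual-address-to-file-offset mapping behind that remark (a minimal Python sketch with made-up names, not llvm-readelf's actual code):

def vaddr_to_file_offset(vaddr, segments):
    # segments: (p_offset, p_vaddr, p_filesz) triples for the PT_LOAD headers
    for p_offset, p_vaddr, p_filesz in segments:
        if p_vaddr <= vaddr < p_vaddr + p_filesz:
            return p_offset + (vaddr - p_vaddr)
    return None  # vaddr is not backed by any loadable segment

With DT_HASH = 0 and a PT_LOAD whose p_vaddr is 0, this reduces to p_offset itself, i.e. the file offset of .hash, which is exactly what the test relies on.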
diff --git a/llvm/test/tools/llvm-readobj/mips-got.test b/llvm/test/tools/llvm-readobj/mips-got.test index e6e21ad6aca2d9..8ed35d4b68e27d 100644 --- a/llvm/test/tools/llvm-readobj/mips-got.test +++ b/llvm/test/tools/llvm-readobj/mips-got.test @@ -1,31 +1,25 @@ -RUN: not llvm-readobj --mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \ -RUN: FileCheck %s -DFILE=%p/Inputs/relocs.obj.elf-mips -check-prefix GOT-OBJ -RUN: llvm-readobj --mips-plt-got %p/Inputs/dynamic-table-exe.mips | \ +RUN: llvm-readobj -A %p/Inputs/dynamic-table-exe.mips | \ RUN: FileCheck %s -check-prefix GOT-EXE -RUN: llvm-readobj --mips-plt-got %p/Inputs/dynamic-table-so.mips | \ +RUN: llvm-readobj -A %p/Inputs/dynamic-table-so.mips | \ RUN: FileCheck %s -check-prefix GOT-SO -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-tls.so.elf-mips64el | \ +RUN: llvm-readobj -A %p/Inputs/got-tls.so.elf-mips64el | \ RUN: FileCheck %s -check-prefix GOT-TLS -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-empty.exe.mipsel | \ +RUN: llvm-readobj -A %p/Inputs/got-empty.exe.mipsel | \ RUN: FileCheck %s -check-prefix GOT-EMPTY -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-static.exe.mips | \ +RUN: llvm-readobj -A %p/Inputs/got-static.exe.mips | \ RUN: FileCheck %s -check-prefix GOT-STATIC -RUN: not llvm-readelf --mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \ -RUN: FileCheck %s -DFILE=%p/Inputs/relocs.obj.elf-mips -check-prefix GNU-GOT-OBJ -RUN: llvm-readelf --mips-plt-got %p/Inputs/dynamic-table-exe.mips | \ +RUN: llvm-readelf -A %p/Inputs/dynamic-table-exe.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-EXE -RUN: llvm-readelf --mips-plt-got %p/Inputs/dynamic-table-so.mips | \ +RUN: llvm-readelf -A %p/Inputs/dynamic-table-so.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-SO -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-tls.so.elf-mips64el | \ +RUN: llvm-readelf -A %p/Inputs/got-tls.so.elf-mips64el | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-TLS -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-empty.exe.mipsel | \ +RUN: llvm-readelf -A %p/Inputs/got-empty.exe.mipsel | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-EMPTY -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-static.exe.mips | \ +RUN: llvm-readelf -A %p/Inputs/got-static.exe.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-STATIC -GOT-OBJ: error: '[[FILE]]': Cannot find .got section - GOT-EXE: Primary GOT { GOT-EXE-NEXT: Canonical gp value: 0x418880 GOT-EXE-NEXT: Reserved entries [ @@ -380,8 +374,6 @@ GOT-STATIC-NEXT: } GOT-STATIC-NEXT: ] GOT-STATIC-NEXT: } -GNU-GOT-OBJ: error: '[[FILE]]': Cannot find .got section - GNU-GOT-EXE: Primary GOT: GNU-GOT-EXE-NEXT: Canonical gp value: 00418880 diff --git a/llvm/test/tools/llvm-readobj/mips-options-sec.test b/llvm/test/tools/llvm-readobj/mips-options-sec.test index 64b3f0e91795f2..3636d56cfe6e24 100644 --- a/llvm/test/tools/llvm-readobj/mips-options-sec.test +++ b/llvm/test/tools/llvm-readobj/mips-options-sec.test @@ -1,4 +1,4 @@ -RUN: llvm-readobj --mips-options %p/Inputs/options.obj.elf-mipsel | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/options.obj.elf-mipsel | FileCheck %s CHECK: MIPS Options { CHECK-NEXT: ODK_REGINFO { diff --git a/llvm/test/tools/llvm-readobj/mips-plt.test b/llvm/test/tools/llvm-readobj/mips-plt.test index b130a67d0443fc..4e40ca6aa2c135 100644 --- a/llvm/test/tools/llvm-readobj/mips-plt.test +++ b/llvm/test/tools/llvm-readobj/mips-plt.test @@ -1,5 +1,5 @@ -RUN: llvm-readobj --mips-plt-got 
%p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-plt.exe.elf-mipsel | FileCheck --check-prefix=GNU %s +RUN: llvm-readobj -A %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s +RUN: llvm-readelf -A %p/Inputs/got-plt.exe.elf-mipsel | FileCheck --check-prefix=GNU %s CHECK: PLT GOT { CHECK-NEXT: Reserved entries [ diff --git a/llvm/test/tools/llvm-readobj/mips-reginfo.test b/llvm/test/tools/llvm-readobj/mips-reginfo.test index 7571d4c56bf078..20177a99d8cb74 100644 --- a/llvm/test/tools/llvm-readobj/mips-reginfo.test +++ b/llvm/test/tools/llvm-readobj/mips-reginfo.test @@ -1,4 +1,4 @@ -RUN: llvm-readobj --mips-reginfo %p/Inputs/reginfo.obj.elf-mipsel | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/reginfo.obj.elf-mipsel | FileCheck %s CHECK: MIPS RegInfo { CHECK-NEXT: GP: 0x7FEF diff --git a/llvm/test/tools/llvm-symbolizer/coff-dwarf.test b/llvm/test/tools/llvm-symbolizer/coff-dwarf.test index 790763a2ddf073..03b5d9147fe2e9 100644 --- a/llvm/test/tools/llvm-symbolizer/coff-dwarf.test +++ b/llvm/test/tools/llvm-symbolizer/coff-dwarf.test @@ -6,7 +6,7 @@ RUN: llvm-symbolizer 0x5009 0x5038 -i --relative-address -obj="%p/Inputs/coff-dw RUN: | FileCheck %s This test relies on UnDecorateSymbolName, which is Windows-only. -REQUIRES: target-windows, system-windows +REQUIRES: system-windows CHECK: foo(void) CHECK: coff-dwarf.cpp:7 diff --git a/llvm/test/tools/llvm-symbolizer/coff-exports.test b/llvm/test/tools/llvm-symbolizer/coff-exports.test index 8678aae2960e94..255681178f4522 100644 --- a/llvm/test/tools/llvm-symbolizer/coff-exports.test +++ b/llvm/test/tools/llvm-symbolizer/coff-exports.test @@ -7,8 +7,6 @@ RUN: | FileCheck %s This test relies on UnDecorateSymbolName, which is Win32-only. REQUIRES: system-windows -REQUIRES: target-windows -FIXME: This test depends on host, not target. We get the expected stack trace, except 'foo' appears for the 'bar' frame because 'bar' isn't in the export table. diff --git a/llvm/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml b/llvm/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml new file mode 100644 index 00000000000000..6f21c3212bd9f2 --- /dev/null +++ b/llvm/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml @@ -0,0 +1,98 @@ +## Check how obj2yaml dumps the SHT_LLVM_ADDRSIG section. + +## Check that when possible obj2yaml tries to produce the "Name" tag when +## dumping entries of the SHT_LLVM_ADDRSIG section. It falls back to producing +## the "Index" tag when it can't match a symbol index with a symbol table entry. 
+ +# RUN: yaml2obj --docnum=1 %s -o %t1 +# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=NAME + +# NAME: - Name: .llvm_addrsig +# NAME-NEXT: Type: SHT_LLVM_ADDRSIG +# NAME-NEXT: Link: .symtab +# NAME-NEXT: Symbols: +# NAME-NEXT: - Name: foo +# NAME-NEXT: - Name: bar +# NAME-NEXT: - Index: 0x00000003 +# NAME-NEXT: - Index: 0xFFFFFFFF +# NAME: - Name: .llvm_addrsig_unlinked +# NAME-NEXT: Type: SHT_LLVM_ADDRSIG +# NAME-NEXT: Symbols: +# NAME-NEXT: - Index: 0x00000001 +# NAME-NEXT: - Index: 0x00000002 +# NAME-NEXT: - Index: 0x00000003 +# NAME-NEXT: - Index: 0xFFFFFFFF + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 1 + - Index: 2 + - Index: 3 + - Index: 0xFFFFFFFF + - Name: .llvm_addrsig_unlinked + Type: SHT_LLVM_ADDRSIG + Link: 0 + Symbols: + - Index: 1 + - Index: 2 + - Index: 3 + - Index: 0xFFFFFFFF +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: bar + Type: STT_FUNC + Binding: STB_GLOBAL + +## Check that obj2yaml dumps the SHT_LLVM_ADDRSIG section +## data using the "Content" tag when at least one of the entries is broken, +## e.g. because the entry contains a malformed uleb128 value. + +# RUN: yaml2obj --docnum=2 %s -o %t2 +# RUN: obj2yaml %t2 | FileCheck %s --check-prefix=INVALID-ENTRY + +# INVALID-ENTRY: - Name: .llvm_addrsig +# INVALID-ENTRY-NEXT: Type: SHT_LLVM_ADDRSIG +# INVALID-ENTRY-NEXT: Link: .symtab +# INVALID-ENTRY-NEXT: Content: FFFFFFFFFF + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "FFFFFFFFFF" + +## obj2yaml produces a "Symbols" tag when describing an empty SHT_LLVM_ADDRSIG section. + +# RUN: yaml2obj --docnum=3 %s -o %t3 +# RUN: obj2yaml %t3 | FileCheck %s --check-prefix=EMPTY + +# EMPTY: - Name: .llvm_addrsig +# EMPTY-NEXT: Type: SHT_LLVM_ADDRSIG +# EMPTY-NEXT: Link: .symtab +# EMPTY-NEXT: Symbols: [] + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "" diff --git a/llvm/test/tools/yaml2obj/elf-hash-section.yaml b/llvm/test/tools/yaml2obj/elf-hash-section.yaml index 995c871439a7a2..4aad9c11fd592d 100644 --- a/llvm/test/tools/yaml2obj/elf-hash-section.yaml +++ b/llvm/test/tools/yaml2obj/elf-hash-section.yaml @@ -66,7 +66,7 @@ Sections: # RUN: not yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-BUCKET -# CONTENT-BUCKET: error: "Content" and "Bucket" cannot be used together +# CONTENT-BUCKET: "Bucket" cannot be used with "Content" or "Size" --- !ELF FileHeader: @@ -84,7 +84,7 @@ Sections: # RUN: not yaml2obj --docnum=4 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-CHAIN -# CONTENT-CHAIN: error: "Content" and "Chain" cannot be used together +# CONTENT-CHAIN: "Chain" cannot be used with "Content" or "Size" --- !ELF FileHeader: @@ -134,7 +134,7 @@ Sections: # RUN: not yaml2obj --docnum=7 %s 2>&1 | FileCheck %s --check-prefix=NO-TAGS -# NO-TAGS: error: one of "Content", "Bucket" or "Chain" must be specified +# NO-TAGS: error: one of "Content", "Size", "Bucket" or "Chain" must be specified --- !ELF FileHeader: @@ -177,3 +177,102 @@ Sections: ## SHT_HASH is linked to dynamic symbol table by default if it exists. - Name: .dynsym Type: SHT_DYNSYM + +## Check we can use only "Size" to create a SHT_HASH section. 
+ +# RUN: yaml2obj --docnum=9 %s -o %t9 +# RUN: llvm-readobj --sections --section-data %t9 | FileCheck %s --check-prefix=SIZE + +# SIZE: Name: .hash +# SIZE: Size: +# SIZE-SAME: 17 +# SIZE: SectionData ( +# SIZE-NEXT: 0000: 00000000 00000000 00000000 00000000 | +# SIZE-NEXT: 0010: 00 | +# SIZE-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Size: 0x11 + +## Check we can use "Size" and "Content" together to create a SHT_HASH section. + +# RUN: yaml2obj --docnum=10 %s -o %t10 +# RUN: llvm-readobj --sections --section-data %t10 | FileCheck %s --check-prefix=SIZE-CONTENT + +# SIZE-CONTENT: Name: .hash +# SIZE-CONTENT: Size: +# SIZE-CONTENT-SAME: 5 +# SIZE-CONTENT: SectionData ( +# SIZE-CONTENT-NEXT: 0000: 11223300 00 | +# SIZE-CONTENT-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Size: 0x5 + Content: "112233" + +## Check that when "Size" and "Content" are used together, the size +## must be greater than or equal to the content size. + +# RUN: not yaml2obj --docnum=11 %s 2>&1 | FileCheck %s --check-prefix=SIZE-CONTENT-ERR + +# SIZE-CONTENT-ERR: error: "Size" must be greater than or equal to the content size + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Size: 0x1 + Content: "1122" + +## Check we can't use "Size" and "Bucket" tags together. + +# RUN: not yaml2obj --docnum=12 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-BUCKET + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Size: 0x1 + Bucket: [ 1 ] + +## Check we can't use "Size" and "Chain" tags together. + +# RUN: not yaml2obj --docnum=13 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-CHAIN + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .hash + Type: SHT_HASH + Size: 0x1 + Chain: [ 1 ] diff --git a/llvm/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml b/llvm/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml new file mode 100644 index 00000000000000..1433d6dbc13e7e --- /dev/null +++ b/llvm/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml @@ -0,0 +1,307 @@ +## Check how yaml2obj produces SHT_LLVM_ADDRSIG sections. + +## Check we can describe SHT_LLVM_ADDRSIG using the "Symbols" tag. We can define +## symbols either using names or indexes. 
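An SHT_LLVM_ADDRSIG body is a sequence of ULEB128-encoded symbol table indices, and yaml2obj resolves symbol names to indices before encoding, which is what the SectionData checks in the tests below exercise. A small, self-contained Python sketch of that encoding (illustrative only, not yaml2obj code):

def encode_uleb128(value):
    # Emit 7 bits per byte, least significant group first; the high bit of a
    # byte is set while more non-zero bits remain.
    out = bytearray()
    while True:
        byte = value & 0x7F
        value >>= 7
        if value:
            out.append(byte | 0x80)
        else:
            out.append(byte)
            return bytes(out)

# Consistent with the SYMBOL-INDEX check below: indices 0, 255, 0x11223344 and
# 0xFFFFFFFF encode to 00, FF01, C4E6888901 and FFFFFFFF0F respectively.
assert encode_uleb128(0x11223344).hex() == "c4e6888901"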
+ +# RUN: yaml2obj --docnum=1 %s -o %t1 +# RUN: llvm-readobj --sections --section-data %t1 | FileCheck %s --check-prefix=SYMBOLS + +# SYMBOLS: Section { +# SYMBOLS: Index: 1 +# SYMBOLS-NEXT: Name: .llvm_addrsig +# SYMBOLS-NEXT: Type: SHT_LLVM_ADDRSIG +# SYMBOLS-NEXT: Flags [ +# SYMBOLS-NEXT: ] +# SYMBOLS-NEXT: Address: 0x0 +# SYMBOLS-NEXT: Offset: 0x40 +# SYMBOLS-NEXT: Size: 4 +# SYMBOLS-NEXT: Link: 2 +# SYMBOLS-NEXT: Info: 0 +# SYMBOLS-NEXT: AddressAlignment: 0 +# SYMBOLS-NEXT: EntrySize: 0 +# SYMBOLS-NEXT: SectionData ( +# SYMBOLS-NEXT: 0000: 01020102 +# SYMBOLS-NEXT: ) +# SYMBOLS-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar + - Index: 1 + - Index: 2 +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: bar + Type: STT_FUNC + Binding: STB_GLOBAL + +## We can't specify both "Index" and "Name" when defining a symbol. + +# RUN: not yaml2obj --docnum=2 %s 2>&1 | FileCheck %s --check-prefix=INDEX-NAME + +# INDEX-NAME: error: "Index" and "Name" cannot be used together when defining a symbol + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + Index: 1 +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + +## Check we report an error if an unknown symbol is referenced in the +## SHT_LLVM_ADDRSIG section description. + +# RUN: not yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-UNKNOWN + +# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'foo' by YAML section '.llvm_addrsig' +# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'bar' by YAML section '.llvm_addrsig' + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar + +## Check we can specify any arbitrary symbol indices. + +# RUN: yaml2obj --docnum=4 %s -o %t4 +# RUN: llvm-readobj --sections --section-data %t4 | FileCheck %s --check-prefix=SYMBOL-INDEX + +# SYMBOL-INDEX: Type: SHT_LLVM_ADDRSIG +# SYMBOL-INDEX: SectionData ( +# SYMBOL-INDEX-NEXT: 0000: 00FF01C4 E6888901 FFFFFFFF 0F +# SYMBOL-INDEX-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 0 + - Index: 255 + - Index: 0x11223344 +## 0xFFFFFFFF is a maximum allowed index value. + - Index: 0xFFFFFFFF + +## Check that the maximum symbol index size is 32 bits. + +# RUN: not yaml2obj --docnum=5 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-INDEX-OVERFLOW + +# SYMBOL-INDEX-OVERFLOW: error: out of range hex32 number + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 0x1122334455 + +## Check we can use the "Content" tag to specify any data for SHT_LLVM_ADDRSIG sections. 
+ +# RUN: yaml2obj --docnum=6 %s -o %t6 +# RUN: llvm-readobj --sections --section-data %t6 | FileCheck %s --check-prefix=CONTENT + +# CONTENT: Type: SHT_LLVM_ADDRSIG +# CONTENT: Size: +# CONTENT-SAME: 5 +# CONTENT: SectionData ( +# CONTENT-NEXT: 0000: 11223344 55 +# CONTENT-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "1122334455" + +## Either "Content" or "Symbols" must be specifed for SHT_LLVM_ADDRSIG sections. + +# RUN: not yaml2obj --docnum=7 %s 2>&1 | FileCheck %s --check-prefix=NO-TAGS + +# NO-TAGS: error: one of "Content", "Size" or "Symbols" must be specified + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + +## "Content" and "Symbols" cannot be used together to describe the SHT_LLVM_ADDRSIG section. + +# RUN: not yaml2obj --docnum=8 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS + +# CONTENT-SYMBOLS: "Symbols" cannot be used with "Content" or "Size" + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "" + Symbols: + +## Check we can set an arbitrary sh_link value for SHT_LLVM_ADDRSIG sections. + +# RUN: yaml2obj --docnum=9 %s -o %t9 +# RUN: llvm-readobj --sections %t9 | FileCheck %s --check-prefix=LINK + +# LINK: Name: .llvm_addrsig +# LINK: Link: +# LINK-SAME: 123{{$}} + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Link: 123 + Content: "" + +## Check we can use only "Size" to create a SHT_LLVM_ADDRSIG section. + +# RUN: yaml2obj --docnum=10 %s -o %t10 +# RUN: llvm-readobj --sections --section-data %t10 | FileCheck %s --check-prefix=SIZE + +# SIZE: Name: .llvm_addrsig +# SIZE: Size: +# SIZE-SAME: 17 +# SIZE: SectionData ( +# SIZE-NEXT: 0000: 00000000 00000000 00000000 00000000 | +# SIZE-NEXT: 0010: 00 | +# SIZE-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x11 + +## Check we can use "Size" and "Content" together to create a SHT_LLVM_ADDRSIG section. + +# RUN: yaml2obj --docnum=11 %s -o %t11 +# RUN: llvm-readobj --sections --section-data %t11 | FileCheck %s --check-prefix=SIZE-CONTENT + +# SIZE-CONTENT: Name: .llvm_addrsig_sizegr +# SIZE-CONTENT: Size: +# SIZE-CONTENT-SAME: 5 +# SIZE-CONTENT: SectionData ( +# SIZE-CONTENT-NEXT: 0000: 11223300 00 | +# SIZE-CONTENT-NEXT: ) + +# SIZE-CONTENT: Name: .llvm_addrsig_sizeeq +# SIZE-CONTENT: Size: +# SIZE-CONTENT-SAME: 3 +# SIZE-CONTENT: SectionData ( +# SIZE-CONTENT-NEXT: 0000: 112233 | +# SIZE-CONTENT-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig_sizegr + Type: SHT_LLVM_ADDRSIG + Size: 0x5 + Content: "112233" + - Name: .llvm_addrsig_sizeeq + Type: SHT_LLVM_ADDRSIG + Size: 0x3 + Content: "112233" + +## Check that when "Size" and "Content" are used together, the size +## must be greater than or equal to the content size. 
+ +# RUN: not yaml2obj --docnum=12 %s 2>&1 | FileCheck %s --check-prefix=SIZE-CONTENT-ERR + +# SIZE-CONTENT-ERR: error: "Size" must be greater than or equal to the content size + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x1 + Content: "1122" + +## Check we can't use "Size" and "Symbols" tags together. + +# RUN: not yaml2obj --docnum=13 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x1 + Symbols: [ ] diff --git a/llvm/tools/dsymutil/CMakeLists.txt b/llvm/tools/dsymutil/CMakeLists.txt index 19865e3d20e142..f88e6db62c3883 100644 --- a/llvm/tools/dsymutil/CMakeLists.txt +++ b/llvm/tools/dsymutil/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(DsymutilTableGen) + set(LLVM_LINK_COMPONENTS AllTargetsAsmPrinters AllTargetsCodeGens @@ -7,6 +11,7 @@ set(LLVM_LINK_COMPONENTS DebugInfoDWARF MC Object + Option Support Target ) @@ -27,6 +32,7 @@ add_llvm_tool(dsymutil DEPENDS intrinsics_gen + ${tablegen_deps} ) if(APPLE) diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td new file mode 100644 index 00000000000000..c2114c86a1a377 --- /dev/null +++ b/llvm/tools/dsymutil/Options.td @@ -0,0 +1,146 @@ +include "llvm/Option/OptParser.td" + +class F: Flag<["--", "-"], name>; + +def grp_general : OptionGroup<"Dsymutil">, HelpText<"Dsymutil Options">; + +def help: F<"help">, + HelpText<"Prints this help output.">, + Group; +def: Flag<["-"], "h">, + Alias, + HelpText<"Alias for --help">, + Group; + +def version: F<"version">, + HelpText<"Prints the dsymutil version.">, + Group; +def: Flag<["-"], "v">, + Alias, + HelpText<"Alias for --version">, + Group; + +def verbose: F<"verbose">, + HelpText<"Enable verbose mode.">, + Group; + +def verify: F<"verify">, + HelpText<"Run the DWARF verifier on the linked DWARF debug info.">, + Group; + +def no_output: F<"no-output">, + HelpText<"Do the link in memory, but do not emit the result file.">, + Group; + +def no_swiftmodule_timestamp: F<"no-swiftmodule-timestamp">, + HelpText<"Don't check timestamp for swiftmodule files.">, + Group; + +def no_odr: F<"no-odr">, + HelpText<"Do not use ODR (One Definition Rule) for type uniquing.">, + Group; + +def dump_debug_map: F<"dump-debug-map">, + HelpText<"Parse and dump the debug map to standard output. 
Not DWARF link will take place.">, + Group; + +def yaml_input: F<"y">, + HelpText<"Treat the input file is a YAML debug map rather than a binary.">, + Group; + +def papertrail: F<"papertrail">, + HelpText<"Embed warnings in the linked DWARF debug info.">, + Group; + +def assembly: F<"S">, + HelpText<"Output textual assembly instead of a binary dSYM companion file.">, + Group; + +def symtab: F<"symtab">, + HelpText<"Dumps the symbol table found in executable or object file(s) and exits.">, + Group; +def: Flag<["-"], "s">, + Alias, + HelpText<"Alias for --symtab">, + Group; + +def flat: F<"flat">, + HelpText<"Produce a flat dSYM file (not a bundle).">, + Group; +def: Flag<["-"], "f">, + Alias, + HelpText<"Alias for --flat">, + Group; + +def minimize: F<"minimize">, + HelpText<"When used when creating a dSYM file with Apple accelerator tables, " + "this option will suppress the emission of the .debug_inlines, " + ".debug_pubnames, and .debug_pubtypes sections since dsymutil " + "has better equivalents: .apple_names and .apple_types. When used in " + "conjunction with --update option, this option will cause redundant " + "accelerator tables to be removed.">, + Group; +def: Flag<["-"], "z">, + Alias, + HelpText<"Alias for --minimize">, + Group; + +def update: F<"update">, + HelpText<"Updates existing dSYM files to contain the latest accelerator tables and other DWARF optimizations.">, + Group; +def: Flag<["-"], "u">, + Alias, + HelpText<"Alias for --update">, + Group; + +def output: Separate<["--", "-"], "o">, + MetaVarName<"">, + HelpText<"Specify the output file. Defaults to .dwarf">, + Group; +def: Separate<["-"], "out">, + Alias, + HelpText<"Alias for --o">, + Group; +def: Joined<["--", "-"], "out=">, Alias; +def: Joined<["--", "-"], "o=">, Alias; + +def oso_prepend_path: Separate<["--", "-"], "oso-prepend-path">, + MetaVarName<"">, + HelpText<"Specify a directory to prepend to the paths of object files.">, + Group; +def: Joined<["--", "-"], "oso-prepend-path=">, Alias; + +def symbolmap: Separate<["--", "-"], "symbol-map">, + MetaVarName<"">, + HelpText<"Updates the existing dSYMs inplace using symbol map specified.">, + Group; +def: Joined<["--", "-"], "symbol-map=">, Alias; + +def arch: Separate<["--", "-"], "arch">, + MetaVarName<"">, + HelpText<"Link DWARF debug information only for specified CPU architecture" + "types. This option can be specified multiple times, once for each" + "desired architecture. All CPU architectures will be linked by" + "default.">, + Group; +def: Joined<["--", "-"], "arch=">, Alias; + +def accelerator: Separate<["--", "-"], "accelerator">, + MetaVarName<"">, + HelpText<"Specify the desired type of accelerator table. 
Valid options are 'Apple', 'Dwarf' and 'Default'">, + Group; +def: Joined<["--", "-"], "accelerator=">, Alias; + +def toolchain: Separate<["--", "-"], "toolchain">, + MetaVarName<"">, + HelpText<"Embed toolchain information in dSYM bundle.">, + Group; + +def threads: Separate<["--", "-"], "num-threads">, + MetaVarName<"">, + HelpText<"Specifies the maximum number of simultaneous threads to use when linking multiple architectures.">, + Group; +def: Separate<["-"], "j">, + Alias, + HelpText<"Alias for --num-threads">, + Group; diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index bf42ec73269c7c..983e86808e7e41 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -20,12 +20,16 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFVerifier.h" #include "llvm/Object/Binary.h" #include "llvm/Object/MachO.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/InitLLVM.h" @@ -43,142 +47,232 @@ #include using namespace llvm; -using namespace llvm::cl; using namespace llvm::dsymutil; using namespace object; -static OptionCategory DsymCategory("Specific Options"); -static opt Help("h", desc("Alias for -help"), Hidden); -static opt Version("v", desc("Alias for -version"), Hidden); - -static list InputFiles(Positional, OneOrMore, - desc(""), cat(DsymCategory)); - -static opt - OutputFileOpt("o", - desc("Specify the output file. default: .dwarf"), - value_desc("filename"), cat(DsymCategory)); -static alias OutputFileOptA("out", desc("Alias for -o"), - aliasopt(OutputFileOpt)); - -static opt OsoPrependPath( - "oso-prepend-path", - desc("Specify a directory to prepend to the paths of object files."), - value_desc("path"), cat(DsymCategory)); - -static opt Assembly( - "S", - desc("Output textual assembly instead of a binary dSYM companion file."), - init(false), cat(DsymCategory), cl::Hidden); - -static opt DumpStab( - "symtab", - desc("Dumps the symbol table found in executable or object file(s) and\n" - "exits."), - init(false), cat(DsymCategory)); -static alias DumpStabA("s", desc("Alias for --symtab"), aliasopt(DumpStab)); - -static opt FlatOut("flat", - desc("Produce a flat dSYM file (not a bundle)."), - init(false), cat(DsymCategory)); -static alias FlatOutA("f", desc("Alias for --flat"), aliasopt(FlatOut)); - -static opt Minimize( - "minimize", - desc("When used when creating a dSYM file with Apple accelerator tables,\n" - "this option will suppress the emission of the .debug_inlines, \n" - ".debug_pubnames, and .debug_pubtypes sections since dsymutil \n" - "has better equivalents: .apple_names and .apple_types. 
When used in\n" - "conjunction with --update option, this option will cause redundant\n" - "accelerator tables to be removed."), - init(false), cat(DsymCategory)); -static alias MinimizeA("z", desc("Alias for --minimize"), aliasopt(Minimize)); - -static opt Update( - "update", - desc("Updates existing dSYM files to contain the latest accelerator\n" - "tables and other DWARF optimizations."), - init(false), cat(DsymCategory)); -static alias UpdateA("u", desc("Alias for --update"), aliasopt(Update)); - -static opt SymbolMap( - "symbol-map", - desc("Updates the existing dSYMs inplace using symbol map specified."), - value_desc("bcsymbolmap"), cat(DsymCategory)); - -static cl::opt AcceleratorTable( - "accelerator", cl::desc("Output accelerator tables."), - cl::values(clEnumValN(AccelTableKind::Default, "Default", - "Default for input."), - clEnumValN(AccelTableKind::Apple, "Apple", "Apple"), - clEnumValN(AccelTableKind::Dwarf, "Dwarf", "DWARF")), - cl::init(AccelTableKind::Default), cat(DsymCategory)); - -static opt NumThreads( - "num-threads", - desc("Specifies the maximum number (n) of simultaneous threads to use\n" - "when linking multiple architectures."), - value_desc("n"), init(0), cat(DsymCategory)); -static alias NumThreadsA("j", desc("Alias for --num-threads"), - aliasopt(NumThreads)); - -static opt Verbose("verbose", desc("Verbosity level"), init(false), - cat(DsymCategory)); - -static opt - NoOutput("no-output", - desc("Do the link in memory, but do not emit the result file."), - init(false), cat(DsymCategory)); - -static opt - NoTimestamp("no-swiftmodule-timestamp", - desc("Don't check timestamp for swiftmodule files."), - init(false), cat(DsymCategory)); - -static list ArchFlags( - "arch", - desc("Link DWARF debug information only for specified CPU architecture\n" - "types. This option can be specified multiple times, once for each\n" - "desired architecture. All CPU architectures will be linked by\n" - "default."), - value_desc("arch"), ZeroOrMore, cat(DsymCategory)); - -static opt - NoODR("no-odr", - desc("Do not use ODR (One Definition Rule) for type uniquing."), - init(false), cat(DsymCategory)); - -static opt DumpDebugMap( - "dump-debug-map", - desc("Parse and dump the debug map to standard output. Not DWARF link " - "will take place."), - init(false), cat(DsymCategory)); - -static opt InputIsYAMLDebugMap( - "y", desc("Treat the input file is a YAML debug map rather than a binary."), - init(false), cat(DsymCategory)); - -static opt Verify("verify", desc("Verify the linked DWARF debug info."), - cat(DsymCategory)); - -static opt - Toolchain("toolchain", desc("Embed toolchain information in dSYM bundle."), - cat(DsymCategory)); - -static opt - PaperTrailWarnings("papertrail", - desc("Embed warnings in the linked DWARF debug info."), - cat(DsymCategory)); - -static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) { - if (NoOutput) - return Error::success(); +namespace { +enum ID { + OPT_INVALID = 0, // This is not an option ID. 
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, +#include "Options.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Options.inc" +#undef PREFIX + +const opt::OptTable::Info InfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + { \ + PREFIX, NAME, HELPTEXT, \ + METAVAR, OPT_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, \ + OPT_##ALIAS, ALIASARGS, VALUES}, +#include "Options.inc" +#undef OPTION +}; + +class DsymutilOptTable : public opt::OptTable { +public: + DsymutilOptTable() : OptTable(InfoTable) {} +}; +} // namespace + +struct DsymutilOptions { + bool DumpDebugMap = false; + bool DumpStab = false; + bool Flat = false; + bool InputIsYAMLDebugMap = false; + bool PaperTrailWarnings = false; + bool Verify = false; + std::string SymbolMap; + std::string OutputFile; + std::string Toolchain; + std::vector Archs; + std::vector InputFiles; + unsigned NumThreads; + dsymutil::LinkOptions LinkOpts; +}; + +/// Return a list of input files. This function has logic for dealing with the +/// special case where we might have dSYM bundles as input. The function +/// returns an error when the directory structure doesn't match that of a dSYM +/// bundle. +static Expected> getInputs(opt::InputArgList &Args, + bool DsymAsInput) { + std::vector InputFiles; + for (auto *File : Args.filtered(OPT_INPUT)) + InputFiles.push_back(File->getValue()); + + if (!DsymAsInput) + return InputFiles; + + // If we are updating, we might get dSYM bundles as input. + std::vector Inputs; + for (const auto &Input : InputFiles) { + if (!sys::fs::is_directory(Input)) { + Inputs.push_back(Input); + continue; + } + + // Make sure that we're dealing with a dSYM bundle. + SmallString<256> BundlePath(Input); + sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); + if (!sys::fs::is_directory(BundlePath)) + return make_error( + Input + " is a directory, but doesn't look like a dSYM bundle.", + inconvertibleErrorCode()); + + // Create a directory iterator to iterate over all the entries in the + // bundle. + std::error_code EC; + sys::fs::directory_iterator DirIt(BundlePath, EC); + sys::fs::directory_iterator DirEnd; + if (EC) + return errorCodeToError(EC); + + // Add each entry to the list of inputs. + while (DirIt != DirEnd) { + Inputs.push_back(DirIt->path()); + DirIt.increment(EC); + if (EC) + return errorCodeToError(EC); + } + } + return Inputs; +} + +// Verify that the given combination of options makes sense. +static Error verifyOptions(const DsymutilOptions &Options) { + if (Options.LinkOpts.Update && + std::find(Options.InputFiles.begin(), Options.InputFiles.end(), "-") != + Options.InputFiles.end()) { + // FIXME: We cannot use stdin for an update because stdin will be + // consumed by the BinaryHolder during the debugmap parsing, and + // then we will want to consume it again in DwarfLinker. If we + // used a unique BinaryHolder object that could cache multiple + // binaries this restriction would go away. 
+ return make_error( + "standard input cannot be used as input for a dSYM update.", + errc::invalid_argument); + } + + if (!Options.Flat && Options.OutputFile == "-") + return make_error( + "cannot emit to standard output without --flat.", + errc::invalid_argument); + + if (Options.InputFiles.size() > 1 && Options.Flat && + !Options.OutputFile.empty()) + return make_error( + "cannot use -o with multiple inputs in flat mode.", + errc::invalid_argument); + + if (Options.PaperTrailWarnings && Options.InputIsYAMLDebugMap) + return make_error( + "paper trail warnings are not supported for YAML input.", + errc::invalid_argument); + + return Error::success(); +} + +static Expected getAccelTableKind(opt::InputArgList &Args) { + if (opt::Arg *Accelerator = Args.getLastArg(OPT_accelerator)) { + StringRef S = Accelerator->getValue(); + if (S == "Apple") + return AccelTableKind::Apple; + if (S == "Dwarf") + return AccelTableKind::Dwarf; + if (S == "Default") + return AccelTableKind::Default; + return make_error( + "invalid accelerator type specified: '" + S + + "'. Support values are 'Apple', 'Dwarf' and 'Default'.", + inconvertibleErrorCode()); + } + return AccelTableKind::Default; +} + +/// Parses the command line options into the LinkOptions struct and performs +/// some sanity checking. Returns an error in case the latter fails. +static Expected getOptions(opt::InputArgList &Args) { + DsymutilOptions Options; + + Options.DumpDebugMap = Args.hasArg(OPT_dump_debug_map); + Options.DumpStab = Args.hasArg(OPT_symtab); + Options.Flat = Args.hasArg(OPT_flat); + Options.InputIsYAMLDebugMap = Args.hasArg(OPT_yaml_input); + Options.PaperTrailWarnings = Args.hasArg(OPT_papertrail); + Options.Verify = Args.hasArg(OPT_verify); + + Options.LinkOpts.Minimize = Args.hasArg(OPT_minimize); + Options.LinkOpts.NoODR = Args.hasArg(OPT_no_odr); + Options.LinkOpts.NoOutput = Args.hasArg(OPT_no_output); + Options.LinkOpts.NoTimestamp = Args.hasArg(OPT_no_swiftmodule_timestamp); + Options.LinkOpts.Update = Args.hasArg(OPT_update); + Options.LinkOpts.Verbose = Args.hasArg(OPT_verbose); + + if (Expected AccelKind = getAccelTableKind(Args)) { + Options.LinkOpts.TheAccelTableKind = *AccelKind; + } else { + return AccelKind.takeError(); + } + + if (opt::Arg *SymbolMap = Args.getLastArg(OPT_symbolmap)) + Options.SymbolMap = SymbolMap->getValue(); + + if (Args.hasArg(OPT_symbolmap)) + Options.LinkOpts.Update = true; + + if (Expected> InputFiles = + getInputs(Args, Options.LinkOpts.Update)) { + Options.InputFiles = std::move(*InputFiles); + } else { + return InputFiles.takeError(); + } + + for (auto *Arch : Args.filtered(OPT_arch)) + Options.Archs.push_back(Arch->getValue()); + + if (opt::Arg *OsoPrependPath = Args.getLastArg(OPT_oso_prepend_path)) + Options.LinkOpts.PrependPath = OsoPrependPath->getValue(); + + if (opt::Arg *OutputFile = Args.getLastArg(OPT_output)) + Options.OutputFile = OutputFile->getValue(); + + if (opt::Arg *Toolchain = Args.getLastArg(OPT_toolchain)) + Options.Toolchain = Toolchain->getValue(); + + if (Args.hasArg(OPT_assembly)) + Options.LinkOpts.FileType = OutputFileType::Assembly; + + if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads)) + Options.LinkOpts.Threads = atoi(NumThreads->getValue()); + else + Options.LinkOpts.Threads = thread::hardware_concurrency(); + if (Options.DumpDebugMap || Options.LinkOpts.Verbose) + Options.LinkOpts.Threads = 1; + + if (getenv("RC_DEBUG_OPTIONS")) + Options.PaperTrailWarnings = true; + + if (Error E = verifyOptions(Options)) + return std::move(E); + return 
Options; +} + +static Error createPlistFile(StringRef Bin, StringRef BundleRoot, + StringRef Toolchain) { // Create plist file to write to. - llvm::SmallString<128> InfoPlist(BundleRoot); - llvm::sys::path::append(InfoPlist, "Contents/Info.plist"); + SmallString<128> InfoPlist(BundleRoot); + sys::path::append(InfoPlist, "Contents/Info.plist"); std::error_code EC; - llvm::raw_fd_ostream PL(InfoPlist, EC, llvm::sys::fs::OF_Text); + raw_fd_ostream PL(InfoPlist, EC, sys::fs::OF_Text); if (EC) return make_error( "cannot create Plist: " + toString(errorCodeToError(EC)), EC); @@ -186,9 +280,9 @@ static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) { CFBundleInfo BI = getBundleInfo(Bin); if (BI.IDStr.empty()) { - llvm::StringRef BundleID = *llvm::sys::path::rbegin(BundleRoot); - if (llvm::sys::path::extension(BundleRoot) == ".dSYM") - BI.IDStr = llvm::sys::path::stem(BundleID); + StringRef BundleID = *sys::path::rbegin(BundleRoot); + if (sys::path::extension(BundleRoot) == ".dSYM") + BI.IDStr = sys::path::stem(BundleID); else BI.IDStr = BundleID; } @@ -236,21 +330,18 @@ static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) { return Error::success(); } -static Error createBundleDir(llvm::StringRef BundleBase) { - if (NoOutput) - return Error::success(); - - llvm::SmallString<128> Bundle(BundleBase); - llvm::sys::path::append(Bundle, "Contents", "Resources", "DWARF"); +static Error createBundleDir(StringRef BundleBase) { + SmallString<128> Bundle(BundleBase); + sys::path::append(Bundle, "Contents", "Resources", "DWARF"); if (std::error_code EC = - create_directories(Bundle.str(), true, llvm::sys::fs::perms::all_all)) + create_directories(Bundle.str(), true, sys::fs::perms::all_all)) return make_error( "cannot create bundle: " + toString(errorCodeToError(EC)), EC); return Error::success(); } -static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) { +static bool verify(StringRef OutputFile, StringRef Arch, bool Verbose) { if (OutputFile == "-") { WithColor::warning() << "verification skipped for " << Arch << "because writing to stdout.\n"; @@ -280,33 +371,34 @@ static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) { namespace { struct OutputLocation { - OutputLocation(std::string DWARFFile, - llvm::Optional ResourceDir = {}) + OutputLocation(std::string DWARFFile, Optional ResourceDir = {}) : DWARFFile(DWARFFile), ResourceDir(ResourceDir) {} /// This method is a workaround for older compilers. - llvm::Optional getResourceDir() const { return ResourceDir; } + Optional getResourceDir() const { return ResourceDir; } std::string DWARFFile; - llvm::Optional ResourceDir; + Optional ResourceDir; }; -} +} // namespace -static Expected getOutputFileName(llvm::StringRef InputFile) { - if (OutputFileOpt == "-") - return OutputLocation(OutputFileOpt); +static Expected +getOutputFileName(StringRef InputFile, const DsymutilOptions &Options) { + if (Options.OutputFile == "-") + return OutputLocation(Options.OutputFile); // When updating, do in place replacement. - if (OutputFileOpt.empty() && (Update || !SymbolMap.empty())) + if (Options.OutputFile.empty() && + (Options.LinkOpts.Update || !Options.SymbolMap.empty())) return OutputLocation(InputFile); // If a flat dSYM has been requested, things are pretty simple. 
- if (FlatOut) { - if (OutputFileOpt.empty()) { + if (Options.Flat) { + if (Options.OutputFile.empty()) { if (InputFile == "-") return OutputLocation{"a.out.dwarf", {}}; return OutputLocation((InputFile + ".dwarf").str()); } - return OutputLocation(OutputFileOpt); + return OutputLocation(Options.OutputFile); } // We need to create/update a dSYM bundle. @@ -317,193 +409,87 @@ static Expected getOutputFileName(llvm::StringRef InputFile) { // Resources/ // DWARF/ // - std::string DwarfFile = - InputFile == "-" ? llvm::StringRef("a.out") : InputFile; - llvm::SmallString<128> Path(OutputFileOpt); + std::string DwarfFile = InputFile == "-" ? StringRef("a.out") : InputFile; + SmallString<128> Path(Options.OutputFile); if (Path.empty()) Path = DwarfFile + ".dSYM"; - if (auto E = createBundleDir(Path)) - return std::move(E); - if (auto E = createPlistFile(DwarfFile, Path)) - return std::move(E); + if (!Options.LinkOpts.NoOutput) { + if (auto E = createBundleDir(Path)) + return std::move(E); + if (auto E = createPlistFile(DwarfFile, Path, Options.Toolchain)) + return std::move(E); + } - llvm::sys::path::append(Path, "Contents", "Resources"); + sys::path::append(Path, "Contents", "Resources"); std::string ResourceDir = Path.str(); - llvm::sys::path::append(Path, "DWARF", llvm::sys::path::filename(DwarfFile)); + sys::path::append(Path, "DWARF", sys::path::filename(DwarfFile)); return OutputLocation(Path.str(), ResourceDir); } -/// Parses the command line options into the LinkOptions struct and performs -/// some sanity checking. Returns an error in case the latter fails. -static Expected getOptions() { - LinkOptions Options; - - Options.Verbose = Verbose; - Options.NoOutput = NoOutput; - Options.NoODR = NoODR; - Options.Minimize = Minimize; - Options.Update = Update; - Options.NoTimestamp = NoTimestamp; - Options.PrependPath = OsoPrependPath; - Options.TheAccelTableKind = AcceleratorTable; - - if (!SymbolMap.empty()) - Options.Update = true; - - if (Assembly) - Options.FileType = OutputFileType::Assembly; - - if (Options.Update && std::find(InputFiles.begin(), InputFiles.end(), "-") != - InputFiles.end()) { - // FIXME: We cannot use stdin for an update because stdin will be - // consumed by the BinaryHolder during the debugmap parsing, and - // then we will want to consume it again in DwarfLinker. If we - // used a unique BinaryHolder object that could cache multiple - // binaries this restriction would go away. - return make_error( - "standard input cannot be used as input for a dSYM update.", - inconvertibleErrorCode()); - } - - if (NumThreads == 0) - Options.Threads = llvm::thread::hardware_concurrency(); - else - Options.Threads = NumThreads; - if (DumpDebugMap || Verbose) - Options.Threads = 1; - - return Options; -} - -/// Return a list of input files. This function has logic for dealing with the -/// special case where we might have dSYM bundles as input. The function -/// returns an error when the directory structure doesn't match that of a dSYM -/// bundle. -static Expected> getInputs(bool DsymAsInput) { - if (!DsymAsInput) - return InputFiles; - - // If we are updating, we might get dSYM bundles as input. - std::vector Inputs; - for (const auto &Input : InputFiles) { - if (!llvm::sys::fs::is_directory(Input)) { - Inputs.push_back(Input); - continue; - } - - // Make sure that we're dealing with a dSYM bundle. 
- SmallString<256> BundlePath(Input); - sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); - if (!llvm::sys::fs::is_directory(BundlePath)) - return make_error( - Input + " is a directory, but doesn't look like a dSYM bundle.", - inconvertibleErrorCode()); - - // Create a directory iterator to iterate over all the entries in the - // bundle. - std::error_code EC; - llvm::sys::fs::directory_iterator DirIt(BundlePath, EC); - llvm::sys::fs::directory_iterator DirEnd; - if (EC) - return errorCodeToError(EC); - - // Add each entry to the list of inputs. - while (DirIt != DirEnd) { - Inputs.push_back(DirIt->path()); - DirIt.increment(EC); - if (EC) - return errorCodeToError(EC); - } - } - return Inputs; -} - int main(int argc, char **argv) { InitLLVM X(argc, argv); + // Parse arguments. + DsymutilOptTable T; + unsigned MAI; + unsigned MAC; + ArrayRef ArgsArr = makeArrayRef(argv + 1, argc - 1); + opt::InputArgList Args = T.ParseArgs(ArgsArr, MAI, MAC); + void *P = (void *)(intptr_t)getOutputFileName; - std::string SDKPath = llvm::sys::fs::getMainExecutable(argv[0], P); - SDKPath = llvm::sys::path::parent_path(SDKPath); - - HideUnrelatedOptions({&DsymCategory, &ColorCategory}); - llvm::cl::ParseCommandLineOptions( - argc, argv, - "manipulate archived DWARF debug symbol files.\n\n" - "dsymutil links the DWARF debug information found in the object files\n" - "for the executable by using debug symbols information\n" - "contained in its symbol table.\n"); - - if (Help) { - PrintHelpMessage(); + std::string SDKPath = sys::fs::getMainExecutable(argv[0], P); + SDKPath = sys::path::parent_path(SDKPath); + + if (Args.hasArg(OPT_help)) { + T.PrintHelp( + outs(), (std::string(argv[0]) + " [options] ").c_str(), + "manipulate archived DWARF debug symbol files.\n\n" + "dsymutil links the DWARF debug information found in the object files\n" + "for the executable by using debug symbols information\n" + "contained in its symbol table.\n", + false); return 0; } - if (Version) { - llvm::cl::PrintVersionMessage(); + if (Args.hasArg(OPT_version)) { + cl::PrintVersionMessage(); return 0; } - auto OptionsOrErr = getOptions(); + auto OptionsOrErr = getOptions(Args); if (!OptionsOrErr) { WithColor::error() << toString(OptionsOrErr.takeError()); return 1; } - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); - - auto InputsOrErr = getInputs(OptionsOrErr->Update); - if (!InputsOrErr) { - WithColor::error() << toString(InputsOrErr.takeError()) << '\n'; - return 1; - } - - if (!FlatOut && OutputFileOpt == "-") { - WithColor::error() << "cannot emit to standard output without --flat\n"; - return 1; - } - - if (InputsOrErr->size() > 1 && FlatOut && !OutputFileOpt.empty()) { - WithColor::error() << "cannot use -o with multiple inputs in flat mode\n"; - return 1; - } - - if (InputFiles.size() > 1 && !SymbolMap.empty() && - !llvm::sys::fs::is_directory(SymbolMap)) { - WithColor::error() << "when unobfuscating multiple files, --symbol-map " - << "needs to point to a directory.\n"; - return 1; - } + auto &Options = *OptionsOrErr; - if (getenv("RC_DEBUG_OPTIONS")) - PaperTrailWarnings = true; + InitializeAllTargetInfos(); + InitializeAllTargetMCs(); + InitializeAllTargets(); + InitializeAllAsmPrinters(); - if (PaperTrailWarnings && InputIsYAMLDebugMap) - WithColor::warning() - << "Paper trail warnings are not supported for YAML input"; - - for (const auto &Arch : ArchFlags) + for (const auto &Arch : Options.Archs) if (Arch != "*" && 
Arch != "all" && - !llvm::object::MachOObjectFile::isValidArch(Arch)) { + !object::MachOObjectFile::isValidArch(Arch)) { WithColor::error() << "unsupported cpu architecture: '" << Arch << "'\n"; return 1; } - SymbolMapLoader SymMapLoader(SymbolMap); + SymbolMapLoader SymMapLoader(Options.SymbolMap); - for (auto &InputFile : *InputsOrErr) { + for (auto &InputFile : Options.InputFiles) { // Dump the symbol table for each input file and requested arch - if (DumpStab) { - if (!dumpStab(InputFile, ArchFlags, OsoPrependPath)) + if (Options.DumpStab) { + if (!dumpStab(InputFile, Options.Archs, Options.LinkOpts.PrependPath)) return 1; continue; } auto DebugMapPtrsOrErr = - parseDebugMap(InputFile, ArchFlags, OsoPrependPath, PaperTrailWarnings, - Verbose, InputIsYAMLDebugMap); + parseDebugMap(InputFile, Options.Archs, Options.LinkOpts.PrependPath, + Options.PaperTrailWarnings, Options.LinkOpts.Verbose, + Options.InputIsYAMLDebugMap); if (auto EC = DebugMapPtrsOrErr.getError()) { WithColor::error() << "cannot parse the debug map for '" << InputFile @@ -511,12 +497,12 @@ int main(int argc, char **argv) { return 1; } - if (OptionsOrErr->Update) { + if (Options.LinkOpts.Update) { // The debug map should be empty. Add one object file corresponding to // the input file. for (auto &Map : *DebugMapPtrsOrErr) Map->addDebugMapObject(InputFile, - llvm::sys::TimePoint()); + sys::TimePoint()); } // Ensure that the debug map is not empty (anymore). @@ -529,26 +515,27 @@ int main(int argc, char **argv) { BinaryHolder BinHolder; unsigned ThreadCount = - std::min(OptionsOrErr->Threads, DebugMapPtrsOrErr->size()); - llvm::ThreadPool Threads(ThreadCount); + std::min(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size()); + ThreadPool Threads(ThreadCount); // If there is more than one link to execute, we need to generate // temporary files. - bool NeedsTempFiles = - !DumpDebugMap && (OutputFileOpt != "-") && - (DebugMapPtrsOrErr->size() != 1 || OptionsOrErr->Update); + const bool NeedsTempFiles = + !Options.DumpDebugMap && (Options.OutputFile != "-") && + (DebugMapPtrsOrErr->size() != 1 || Options.LinkOpts.Update); + const bool Verify = Options.Verify && !Options.LinkOpts.NoOutput; - llvm::SmallVector TempFiles; + SmallVector TempFiles; std::atomic_char AllOK(1); for (auto &Map : *DebugMapPtrsOrErr) { - if (Verbose || DumpDebugMap) - Map->print(llvm::outs()); + if (Options.LinkOpts.Verbose || Options.DumpDebugMap) + Map->print(outs()); - if (DumpDebugMap) + if (Options.DumpDebugMap) continue; - if (!SymbolMap.empty()) - OptionsOrErr->Translator = SymMapLoader.Load(InputFile, *Map); + if (!Options.SymbolMap.empty()) + Options.LinkOpts.Translator = SymMapLoader.Load(InputFile, *Map); if (Map->begin() == Map->end()) WithColor::warning() @@ -560,12 +547,12 @@ int main(int argc, char **argv) { std::shared_ptr OS; Expected OutputLocationOrErr = - getOutputFileName(InputFile); + getOutputFileName(InputFile, Options); if (!OutputLocationOrErr) { WithColor::error() << toString(OutputLocationOrErr.takeError()); return 1; } - OptionsOrErr->ResourceDir = OutputLocationOrErr->getResourceDir(); + Options.LinkOpts.ResourceDir = OutputLocationOrErr->getResourceDir(); std::string OutputFile = OutputLocationOrErr->DWARFFile; if (NeedsTempFiles) { @@ -583,8 +570,8 @@ int main(int argc, char **argv) { OutputFile = TempFile.TmpName; } else { std::error_code EC; - OS = std::make_shared(NoOutput ? "-" : OutputFile, EC, - sys::fs::OF_None); + OS = std::make_shared( + Options.LinkOpts.NoOutput ? 
"-" : OutputFile, EC, sys::fs::OF_None); if (EC) { WithColor::error() << OutputFile << ": " << EC.message(); return 1; @@ -596,17 +583,18 @@ int main(int argc, char **argv) { AllOK.fetch_and( linkDwarf(*Stream, BinHolder, *Map, std::move(Options))); Stream->flush(); - if (Verify && !NoOutput) - AllOK.fetch_and(verify(OutputFile, Map->getTriple().getArchName())); + if (Verify) + AllOK.fetch_and(verify(OutputFile, Map->getTriple().getArchName(), + Options.Verbose)); }; // FIXME: The DwarfLinker can have some very deep recursion that can max // out the (significantly smaller) stack when using threads. We don't // want this limitation when we only have a single thread. if (ThreadCount == 1) - LinkLambda(OS, *OptionsOrErr); + LinkLambda(OS, Options.LinkOpts); else - Threads.async(LinkLambda, OS, *OptionsOrErr); + Threads.async(LinkLambda, OS, Options.LinkOpts); } Threads.wait(); @@ -615,14 +603,15 @@ int main(int argc, char **argv) { return 1; if (NeedsTempFiles) { - Expected OutputLocationOrErr = getOutputFileName(InputFile); + Expected OutputLocationOrErr = + getOutputFileName(InputFile, Options); if (!OutputLocationOrErr) { WithColor::error() << toString(OutputLocationOrErr.takeError()); return 1; } if (!MachOUtils::generateUniversalBinary(TempFiles, OutputLocationOrErr->DWARFFile, - *OptionsOrErr, SDKPath)) + Options.LinkOpts, SDKPath)) return 1; } } diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c index ff96037d4af104..e498de6a745ad9 100644 --- a/llvm/tools/llvm-c-test/debuginfo.c +++ b/llvm/tools/llvm-c-test/debuginfo.c @@ -170,6 +170,27 @@ int llvm_test_dibuilder(void) { LLVMDIBuilderInsertDbgValueAtEnd(DIB, FooVal1, FooVar1, FooVarValueExpr, FooVarsLocation, FooVarBlock); + LLVMMetadataRef MacroFile = + LLVMDIBuilderCreateTempMacroFile(DIB, NULL, 0, File); + LLVMDIBuilderCreateMacro(DIB, MacroFile, 0, LLVMDWARFMacinfoRecordTypeDefine, + "SIMPLE_DEFINE", 13, NULL, 0); + LLVMDIBuilderCreateMacro(DIB, MacroFile, 0, LLVMDWARFMacinfoRecordTypeDefine, + "VALUE_DEFINE", 12, "1", 1); + + LLVMMetadataRef EnumeratorTestA = + LLVMDIBuilderCreateEnumerator(DIB, "Test_A", strlen("Test_A"), 0, true); + LLVMMetadataRef EnumeratorTestB = + LLVMDIBuilderCreateEnumerator(DIB, "Test_B", strlen("Test_B"), 1, true); + LLVMMetadataRef EnumeratorTestC = + LLVMDIBuilderCreateEnumerator(DIB, "Test_B", strlen("Test_C"), 2, true); + LLVMMetadataRef EnumeratorsTest[] = {EnumeratorTestA, EnumeratorTestB, + EnumeratorTestC}; + LLVMMetadataRef EnumTest = LLVMDIBuilderCreateEnumerationType( + DIB, NameSpace, "EnumTest", strlen("EnumTest"), File, 0, 64, 0, + EnumeratorsTest, 3, Int64Ty); + LLVMAddNamedMetadataOperand( + M, "EnumTest", LLVMMetadataAsValue(LLVMGetModuleContext(M), EnumTest)); + LLVMDIBuilderFinalize(DIB); char *MStr = LLVMPrintModuleToString(M); diff --git a/llvm/tools/llvm-dwarfdump/Statistics.cpp b/llvm/tools/llvm-dwarfdump/Statistics.cpp index d6bf02b69d10b6..c29ad783a9e61c 100644 --- a/llvm/tools/llvm-dwarfdump/Statistics.cpp +++ b/llvm/tools/llvm-dwarfdump/Statistics.cpp @@ -600,7 +600,7 @@ bool collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, printDatum(OS, "total vars procesed by location statistics", LocStats.NumVar); printLocationStats(OS, "vars", LocStats.VarLocStats); printLocationStats(OS, "vars (excluding the debug entry values)", - LocStats.ParamNonEntryValLocStats); + LocStats.VarNonEntryValLocStats); OS << "}\n"; LLVM_DEBUG( llvm::dbgs() << "Total Availability: " diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp 
b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp index 954471659e5931..860db92a81dac1 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp @@ -79,7 +79,8 @@ class LoopSnippetRepetitor : public SnippetRepetitor { for (const auto &LiveIn : Entry.MBB->liveins()) Loop.MBB->addLiveIn(LiveIn); Loop.addInstructions(Instructions); - ET.decrementLoopCounterAndLoop(*Loop.MBB, State.getInstrInfo()); + ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, + State.getInstrInfo()); // Set up the exit basic block. Loop.MBB->addSuccessor(Exit.MBB, llvm::BranchProbability::getZero()); diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h index 4b0c9d17dd7fed..70313a7a2f7ac4 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ b/llvm/tools/llvm-exegesis/lib/Target.h @@ -95,7 +95,8 @@ class ExegesisTarget { } // Adds the code to decrement the loop counter and - virtual void decrementLoopCounterAndLoop(MachineBasicBlock &MBB, + virtual void decrementLoopCounterAndJump(MachineBasicBlock &MBB, + MachineBasicBlock &TargetMBB, const llvm::MCInstrInfo &MII) const { llvm_unreachable("decrementLoopCounterAndBranch() requires " "getLoopCounterRegister() > 0"); diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index bf008e8bbc7a51..ce66610891d0af 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -448,7 +448,8 @@ class ExegesisX86Target : public ExegesisTarget { void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, unsigned Offset) const override; - void decrementLoopCounterAndLoop(MachineBasicBlock &MBB, + void decrementLoopCounterAndJump(MachineBasicBlock &MBB, + MachineBasicBlock &TargetMBB, const llvm::MCInstrInfo &MII) const override; std::vector setRegTo(const llvm::MCSubtargetInfo &STI, @@ -558,14 +559,15 @@ void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT, SetOp(MemOpIdx + 4, MCOperand::createReg(0)); // Segment } -void ExegesisX86Target::decrementLoopCounterAndLoop( - MachineBasicBlock &MBB, const llvm::MCInstrInfo &MII) const { +void ExegesisX86Target::decrementLoopCounterAndJump( + MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, + const llvm::MCInstrInfo &MII) const { BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8)) .addDef(kLoopCounterReg) .addUse(kLoopCounterReg) .addImm(-1); BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1)) - .addMBB(&MBB) + .addMBB(&TargetMBB) .addImm(X86::COND_NE); } diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp index 067c38a56cd0c3..9488dfe9b75d9a 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp @@ -26,53 +26,55 @@ static bool isMachOStubsSection(Section &S) { return S.getName() == "$__STUBS"; } -static Expected getFirstRelocationEdge(AtomGraph &G, DefinedAtom &DA) { - auto EItr = std::find_if(DA.edges().begin(), DA.edges().end(), +static Expected getFirstRelocationEdge(LinkGraph &G, Block &B) { + auto EItr = std::find_if(B.edges().begin(), B.edges().end(), [](Edge &E) { return E.isRelocation(); }); - if (EItr == DA.edges().end()) + if (EItr == B.edges().end()) return make_error("GOT entry in " + G.getName() + ", \"" + - DA.getSection().getName() + + B.getSection().getName() + "\" has no relocations", inconvertibleErrorCode()); return *EItr; } -static Expected getMachOGOTTarget(AtomGraph &G, DefinedAtom &DA) { - 
auto E = getFirstRelocationEdge(G, DA); +static Expected getMachOGOTTarget(LinkGraph &G, Block &B) { + auto E = getFirstRelocationEdge(G, B); if (!E) return E.takeError(); - auto &TA = E->getTarget(); - if (!TA.hasName()) - return make_error("GOT entry in " + G.getName() + ", \"" + - DA.getSection().getName() + - "\" points to anonymous " - "atom", - inconvertibleErrorCode()); - if (TA.isDefined() || TA.isAbsolute()) + auto &TargetSym = E->getTarget(); + if (!TargetSym.hasName()) return make_error( - "GOT entry \"" + TA.getName() + "\" in " + G.getName() + ", \"" + - DA.getSection().getName() + "\" does not point to an external atom", + "GOT entry in " + G.getName() + ", \"" + + TargetSym.getBlock().getSection().getName() + + "\" points to anonymous " + "symbol", inconvertibleErrorCode()); - return TA; + if (TargetSym.isDefined() || TargetSym.isAbsolute()) + return make_error( + "GOT entry \"" + TargetSym.getName() + "\" in " + G.getName() + ", \"" + + TargetSym.getBlock().getSection().getName() + + "\" does not point to an external symbol", + inconvertibleErrorCode()); + return TargetSym; } -static Expected getMachOStubTarget(AtomGraph &G, DefinedAtom &DA) { - auto E = getFirstRelocationEdge(G, DA); +static Expected getMachOStubTarget(LinkGraph &G, Block &B) { + auto E = getFirstRelocationEdge(G, B); if (!E) return E.takeError(); - auto &GOTA = E->getTarget(); - if (!GOTA.isDefined() || - !isMachOGOTSection(static_cast(GOTA).getSection())) - return make_error("Stubs entry in " + G.getName() + ", \"" + - DA.getSection().getName() + - "\" does not point to GOT entry", - inconvertibleErrorCode()); - return getMachOGOTTarget(G, static_cast(GOTA)); + auto &GOTSym = E->getTarget(); + if (!GOTSym.isDefined() || !isMachOGOTSection(GOTSym.getBlock().getSection())) + return make_error( + "Stubs entry in " + G.getName() + ", \"" + + GOTSym.getBlock().getSection().getName() + + "\" does not point to GOT entry", + inconvertibleErrorCode()); + return getMachOGOTTarget(G, GOTSym.getBlock()); } namespace llvm { -Error registerMachOStubsAndGOT(Session &S, AtomGraph &G) { +Error registerMachOStubsAndGOT(Session &S, LinkGraph &G) { auto FileName = sys::path::filename(G.getName()); if (S.FileInfos.count(FileName)) { return make_error("When -check is passed, file names must be " @@ -88,12 +90,12 @@ Error registerMachOStubsAndGOT(Session &S, AtomGraph &G) { for (auto &Sec : G.sections()) { LLVM_DEBUG({ dbgs() << " Section \"" << Sec.getName() << "\": " - << (Sec.atoms_empty() ? "empty. skipping." : "processing...") + << (Sec.symbols_empty() ? "empty. skipping." : "processing...") << "\n"; }); // Skip empty sections. 
- if (Sec.atoms_empty()) + if (Sec.symbols_empty()) continue; if (FileInfo.SectionInfos.count(Sec.getName())) @@ -105,54 +107,65 @@ Error registerMachOStubsAndGOT(Session &S, AtomGraph &G) { bool isGOTSection = isMachOGOTSection(Sec); bool isStubsSection = isMachOStubsSection(Sec); - auto *FirstAtom = *Sec.atoms().begin(); - auto *LastAtom = FirstAtom; - for (auto *DA : Sec.atoms()) { - if (DA->getAddress() < FirstAtom->getAddress()) - FirstAtom = DA; - if (DA->getAddress() > LastAtom->getAddress()) - LastAtom = DA; + bool SectionContainsContent = false; + bool SectionContainsZeroFill = false; + + auto *FirstSym = *Sec.symbols().begin(); + auto *LastSym = FirstSym; + for (auto *Sym : Sec.symbols()) { + if (Sym->getAddress() < FirstSym->getAddress()) + FirstSym = Sym; + if (Sym->getAddress() > LastSym->getAddress()) + LastSym = Sym; if (isGOTSection) { - if (Sec.isZeroFill()) - return make_error("Content atom in zero-fill section", + if (Sym->isSymbolZeroFill()) + return make_error("zero-fill atom in GOT section", inconvertibleErrorCode()); - if (auto TA = getMachOGOTTarget(G, *DA)) { - FileInfo.GOTEntryInfos[TA->getName()] = {DA->getContent(), - DA->getAddress()}; - } else - return TA.takeError(); + if (auto TS = getMachOGOTTarget(G, Sym->getBlock())) + FileInfo.GOTEntryInfos[TS->getName()] = {Sym->getSymbolContent(), + Sym->getAddress()}; + else + return TS.takeError(); + SectionContainsContent = true; } else if (isStubsSection) { - if (Sec.isZeroFill()) - return make_error("Content atom in zero-fill section", + if (Sym->isSymbolZeroFill()) + return make_error("zero-fill atom in Stub section", inconvertibleErrorCode()); - if (auto TA = getMachOStubTarget(G, *DA)) - FileInfo.StubInfos[TA->getName()] = {DA->getContent(), - DA->getAddress()}; + if (auto TS = getMachOStubTarget(G, Sym->getBlock())) + FileInfo.StubInfos[TS->getName()] = {Sym->getSymbolContent(), + Sym->getAddress()}; else - return TA.takeError(); - } else if (DA->hasName() && DA->isGlobal()) { - if (DA->isZeroFill()) - S.SymbolInfos[DA->getName()] = {DA->getSize(), DA->getAddress()}; - else { - if (Sec.isZeroFill()) - return make_error("Content atom in zero-fill section", - inconvertibleErrorCode()); - S.SymbolInfos[DA->getName()] = {DA->getContent(), DA->getAddress()}; + return TS.takeError(); + SectionContainsContent = true; + } else if (Sym->hasName()) { + if (Sym->isSymbolZeroFill()) { + S.SymbolInfos[Sym->getName()] = {Sym->getSize(), Sym->getAddress()}; + SectionContainsZeroFill = true; + } else { + S.SymbolInfos[Sym->getName()] = {Sym->getSymbolContent(), + Sym->getAddress()}; + SectionContainsContent = true; } } } - JITTargetAddress SecAddr = FirstAtom->getAddress(); - uint64_t SecSize = (LastAtom->getAddress() + LastAtom->getSize()) - - FirstAtom->getAddress(); + JITTargetAddress SecAddr = FirstSym->getAddress(); + uint64_t SecSize = + (LastSym->getBlock().getAddress() + LastSym->getBlock().getSize()) - + SecAddr; - if (Sec.isZeroFill()) + if (SectionContainsZeroFill && SectionContainsContent) + return make_error("Mixed zero-fill and content sections not " + "supported yet", + inconvertibleErrorCode()); + if (SectionContainsZeroFill) FileInfo.SectionInfos[Sec.getName()] = {SecSize, SecAddr}; else FileInfo.SectionInfos[Sec.getName()] = { - StringRef(FirstAtom->getContent().data(), SecSize), SecAddr}; + StringRef(FirstSym->getBlock().getContent().data(), SecSize), + SecAddr}; } return Error::success(); diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 
dfee97241a9aa4..7edbea23a044f7 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -86,9 +86,9 @@ static cl::opt ShowAddrs( cl::desc("Print registered symbol, section, got and stub addresses"), cl::init(false)); -static cl::opt ShowAtomGraph( +static cl::opt ShowLinkGraph( "show-graph", - cl::desc("Print the atom graph after fixups have been applied"), + cl::desc("Print the link graph after fixups have been applied"), cl::init(false)); static cl::opt ShowSizes( @@ -151,17 +151,14 @@ operator<<(raw_ostream &OS, const Session::FileInfoMap &FIM) { return OS; } -static uint64_t computeTotalAtomSizes(AtomGraph &G) { +static uint64_t computeTotalBlockSizes(LinkGraph &G) { uint64_t TotalSize = 0; - for (auto *DA : G.defined_atoms()) - if (DA->isZeroFill()) - TotalSize += DA->getZeroFillSize(); - else - TotalSize += DA->getContent().size(); + for (auto *B : G.blocks()) + TotalSize += B->getSize(); return TotalSize; } -static void dumpSectionContents(raw_ostream &OS, AtomGraph &G) { +static void dumpSectionContents(raw_ostream &OS, LinkGraph &G) { constexpr JITTargetAddress DumpWidth = 16; static_assert(isPowerOf2_64(DumpWidth), "DumpWidth must be a power of two"); @@ -172,56 +169,55 @@ static void dumpSectionContents(raw_ostream &OS, AtomGraph &G) { std::sort(Sections.begin(), Sections.end(), [](const Section *LHS, const Section *RHS) { - if (LHS->atoms_empty() && RHS->atoms_empty()) + if (LHS->symbols_empty() && RHS->symbols_empty()) return false; - if (LHS->atoms_empty()) + if (LHS->symbols_empty()) return false; - if (RHS->atoms_empty()) + if (RHS->symbols_empty()) return true; - return (*LHS->atoms().begin())->getAddress() < - (*RHS->atoms().begin())->getAddress(); + SectionRange LHSRange(*LHS); + SectionRange RHSRange(*RHS); + return LHSRange.getStart() < RHSRange.getStart(); }); for (auto *S : Sections) { OS << S->getName() << " content:"; - if (S->atoms_empty()) { + if (S->symbols_empty()) { OS << "\n section empty\n"; continue; } - // Sort atoms into order, then render. - std::vector Atoms(S->atoms().begin(), S->atoms().end()); - std::sort(Atoms.begin(), Atoms.end(), - [](const DefinedAtom *LHS, const DefinedAtom *RHS) { - return LHS->getAddress() < RHS->getAddress(); - }); - - JITTargetAddress NextAddr = Atoms.front()->getAddress() & ~(DumpWidth - 1); - for (auto *DA : Atoms) { - bool IsZeroFill = DA->isZeroFill(); - JITTargetAddress AtomStart = DA->getAddress(); - JITTargetAddress AtomSize = - IsZeroFill ? DA->getZeroFillSize() : DA->getContent().size(); - JITTargetAddress AtomEnd = AtomStart + AtomSize; - const uint8_t *AtomData = - IsZeroFill ? nullptr : DA->getContent().bytes_begin(); - - // Pad any space before the atom starts. - while (NextAddr != AtomStart) { + // Sort symbols into order, then render. + std::vector Syms(S->symbols().begin(), S->symbols().end()); + llvm::sort(Syms, [](const Symbol *LHS, const Symbol *RHS) { + return LHS->getAddress() < RHS->getAddress(); + }); + + JITTargetAddress NextAddr = Syms.front()->getAddress() & ~(DumpWidth - 1); + for (auto *Sym : Syms) { + bool IsZeroFill = Sym->getBlock().isZeroFill(); + JITTargetAddress SymStart = Sym->getAddress(); + JITTargetAddress SymSize = Sym->getSize(); + JITTargetAddress SymEnd = SymStart + SymSize; + const uint8_t *SymData = + IsZeroFill ? nullptr : Sym->getSymbolContent().bytes_begin(); + + // Pad any space before the symbol starts. 
+ while (NextAddr != SymStart) { if (NextAddr % DumpWidth == 0) OS << formatv("\n{0:x16}:", NextAddr); OS << " "; ++NextAddr; } - // Render the atom content. - while (NextAddr != AtomEnd) { + // Render the symbol content. + while (NextAddr != SymEnd) { if (NextAddr % DumpWidth == 0) OS << formatv("\n{0:x16}:", NextAddr); if (IsZeroFill) OS << " 00"; else - OS << formatv(" {0:x-2}", AtomData[NextAddr - AtomStart]); + OS << formatv(" {0:x-2}", SymData[NextAddr - SymStart]); ++NextAddr; } } @@ -291,18 +287,17 @@ class JITLinkSlabAllocator final : public JITLinkMemoryManager { for (auto &KV : Request) { auto &Seg = KV.second; - if (Seg.getContentAlignment() > PageSize) + if (Seg.getAlignment() > PageSize) return make_error("Cannot request higher than page " "alignment", inconvertibleErrorCode()); - if (PageSize % Seg.getContentAlignment() != 0) + if (PageSize % Seg.getAlignment() != 0) return make_error("Page size is not a multiple of " "alignment", inconvertibleErrorCode()); - uint64_t ZeroFillStart = - alignTo(Seg.getContentSize(), Seg.getZeroFillAlignment()); + uint64_t ZeroFillStart = Seg.getContentSize(); uint64_t SegmentSize = ZeroFillStart + Seg.getZeroFillSize(); // Round segment size up to page boundary. @@ -427,7 +422,7 @@ void Session::dumpSessionInfo(raw_ostream &OS) { void Session::modifyPassConfig(const Triple &FTT, PassConfiguration &PassConfig) { if (!CheckFiles.empty()) - PassConfig.PostFixupPasses.push_back([this](AtomGraph &G) { + PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) { if (TT.getObjectFormat() == Triple::MachO) return registerMachOStubsAndGOT(*this, G); return make_error("Unsupported object format for GOT/stub " @@ -435,27 +430,26 @@ void Session::modifyPassConfig(const Triple &FTT, inconvertibleErrorCode()); }); - if (ShowAtomGraph) - PassConfig.PostFixupPasses.push_back([](AtomGraph &G) -> Error { - outs() << "Atom graph post-fixup:\n"; + if (ShowLinkGraph) + PassConfig.PostFixupPasses.push_back([](LinkGraph &G) -> Error { + outs() << "Link graph post-fixup:\n"; G.dump(outs()); return Error::success(); }); - if (ShowSizes) { - PassConfig.PrePrunePasses.push_back([this](AtomGraph &G) -> Error { - SizeBeforePruning += computeTotalAtomSizes(G); - return Error::success(); - }); - PassConfig.PostFixupPasses.push_back([this](AtomGraph &G) -> Error { - SizeAfterFixups += computeTotalAtomSizes(G); - return Error::success(); - }); + PassConfig.PrePrunePasses.push_back([this](LinkGraph &G) -> Error { + SizeBeforePruning += computeTotalBlockSizes(G); + return Error::success(); + }); + PassConfig.PostFixupPasses.push_back([this](LinkGraph &G) -> Error { + SizeAfterFixups += computeTotalBlockSizes(G); + return Error::success(); + }); } if (ShowRelocatedSectionContents) - PassConfig.PostFixupPasses.push_back([](AtomGraph &G) -> Error { + PassConfig.PostFixupPasses.push_back([](LinkGraph &G) -> Error { outs() << "Relocated section contents for " << G.getName() << ":\n"; dumpSectionContents(outs(), G); return Error::success(); @@ -757,8 +751,8 @@ Error runChecks(Session &S) { static void dumpSessionStats(Session &S) { if (ShowSizes) - outs() << "Total size of all atoms before pruning: " << S.SizeBeforePruning - << "\nTotal size of all atoms after fixups: " << S.SizeAfterFixups + outs() << "Total size of all blocks before pruning: " << S.SizeBeforePruning + << "\nTotal size of all blocks after fixups: " << S.SizeAfterFixups << "\n"; } diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.h b/llvm/tools/llvm-jitlink/llvm-jitlink.h index 269597a29a30da..f94a50993c1226 100644 
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.h +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.h @@ -65,7 +65,7 @@ struct Session { uint64_t SizeAfterFixups = 0; }; -Error registerMachOStubsAndGOT(Session &S, jitlink::AtomGraph &G); +Error registerMachOStubsAndGOT(Session &S, jitlink::LinkGraph &G); } // end namespace llvm diff --git a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp b/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp index 28de0eb6efd32a..60f70d71349aa4 100644 --- a/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp +++ b/llvm/tools/llvm-objcopy/COFF/COFFObjcopy.cpp @@ -208,10 +208,11 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) { !Config.SymbolsToGlobalize.empty() || !Config.SymbolsToKeep.empty() || !Config.SymbolsToLocalize.empty() || !Config.SymbolsToWeaken.empty() || !Config.SymbolsToKeepGlobal.empty() || !Config.SectionsToRename.empty() || - !Config.SetSectionFlags.empty() || !Config.SymbolsToRename.empty() || - Config.ExtractDWO || Config.KeepFileSymbols || Config.LocalizeHidden || - Config.PreserveDates || Config.StripDWO || Config.StripNonAlloc || - Config.StripSections || Config.Weaken || Config.DecompressDebugSections || + !Config.SetSectionAlignment.empty() || !Config.SetSectionFlags.empty() || + !Config.SymbolsToRename.empty() || Config.ExtractDWO || + Config.KeepFileSymbols || Config.LocalizeHidden || Config.PreserveDates || + Config.StripDWO || Config.StripNonAlloc || Config.StripSections || + Config.Weaken || Config.DecompressDebugSections || Config.DiscardMode == DiscardType::Locals || !Config.SymbolsToAdd.empty() || Config.EntryExpr) { return createStringError(llvm::errc::invalid_argument, diff --git a/llvm/tools/llvm-objcopy/CopyConfig.cpp b/llvm/tools/llvm-objcopy/CopyConfig.cpp index 99291660a49a6e..9b51b745d50e13 100644 --- a/llvm/tools/llvm-objcopy/CopyConfig.cpp +++ b/llvm/tools/llvm-objcopy/CopyConfig.cpp @@ -155,6 +155,25 @@ static Expected parseRenameSectionValue(StringRef FlagValue) { return SR; } +static Expected> +parseSetSectionAlignment(StringRef FlagValue) { + if (!FlagValue.contains('=')) + return createStringError( + errc::invalid_argument, + "bad format for --set-section-alignment: missing '='"); + auto Split = StringRef(FlagValue).split('='); + if (Split.first.empty()) + return createStringError( + errc::invalid_argument, + "bad format for --set-section-alignment: missing section name"); + uint64_t NewAlign; + if (Split.second.getAsInteger(0, NewAlign)) + return createStringError(errc::invalid_argument, + "invalid alignment for --set-section-alignment: '%s'", + Split.second.str().c_str()); + return std::make_pair(Split.first, NewAlign); +} + static Expected parseSetSectionFlagValue(StringRef FlagValue) { if (!StringRef(FlagValue).contains('=')) @@ -489,6 +508,13 @@ Expected parseObjcopyOptions(ArrayRef ArgsArr) { "multiple renames of section '%s'", SR->OriginalName.str().c_str()); } + for (auto Arg : InputArgs.filtered(OBJCOPY_set_section_alignment)) { + Expected> NameAndAlign = + parseSetSectionAlignment(Arg->getValue()); + if (!NameAndAlign) + return NameAndAlign.takeError(); + Config.SetSectionAlignment[NameAndAlign->first] = NameAndAlign->second; + } for (auto Arg : InputArgs.filtered(OBJCOPY_set_section_flags)) { Expected SFU = parseSetSectionFlagValue(Arg->getValue()); diff --git a/llvm/tools/llvm-objcopy/CopyConfig.h b/llvm/tools/llvm-objcopy/CopyConfig.h index 85d660407f0ecd..745af0ce480f04 100644 --- a/llvm/tools/llvm-objcopy/CopyConfig.h +++ b/llvm/tools/llvm-objcopy/CopyConfig.h @@ -161,6 +161,7 @@ struct CopyConfig { // Map 
options StringMap SectionsToRename; + StringMap SetSectionAlignment; StringMap SetSectionFlags; StringMap SymbolsToRename; diff --git a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp index 2f29d90f5be31b..dd6a7d7e14b86c 100644 --- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp +++ b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp @@ -670,6 +670,14 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj, } } + if (!Config.SetSectionAlignment.empty()) { + for (SectionBase &Sec : Obj.sections()) { + auto I = Config.SetSectionAlignment.find(Sec.Name); + if (I != Config.SetSectionAlignment.end()) + Sec.Align = I->second; + } + } + if (!Config.SetSectionFlags.empty()) { for (auto &Sec : Obj.sections()) { const auto Iter = Config.SetSectionFlags.find(Sec.Name); diff --git a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp index a52931e469fd3a..6d586e7d73f108 100644 --- a/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp +++ b/llvm/tools/llvm-objcopy/MachO/MachOObjcopy.cpp @@ -31,13 +31,14 @@ static Error handleArgs(const CopyConfig &Config, Object &Obj) { !Config.SymbolsToKeepGlobal.empty() || !Config.SectionsToRename.empty() || !Config.SymbolsToRename.empty() || !Config.UnneededSymbolsToRemove.empty() || - !Config.SetSectionFlags.empty() || !Config.ToRemove.empty() || - Config.ExtractDWO || Config.KeepFileSymbols || Config.LocalizeHidden || - Config.PreserveDates || Config.StripDWO || Config.StripNonAlloc || - Config.StripSections || Config.Weaken || Config.DecompressDebugSections || - Config.StripDebug || Config.StripNonAlloc || Config.StripSections || - Config.StripUnneeded || Config.DiscardMode != DiscardType::None || - !Config.SymbolsToAdd.empty() || Config.EntryExpr) { + !Config.SetSectionAlignment.empty() || !Config.SetSectionFlags.empty() || + !Config.ToRemove.empty() || Config.ExtractDWO || Config.KeepFileSymbols || + Config.LocalizeHidden || Config.PreserveDates || Config.StripDWO || + Config.StripNonAlloc || Config.StripSections || Config.Weaken || + Config.DecompressDebugSections || Config.StripDebug || + Config.StripNonAlloc || Config.StripSections || Config.StripUnneeded || + Config.DiscardMode != DiscardType::None || !Config.SymbolsToAdd.empty() || + Config.EntryExpr) { return createStringError(llvm::errc::invalid_argument, "option not supported by llvm-objcopy for MachO"); } diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td index 87b678a8b169f9..9e6b6f0005cd1f 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td +++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td @@ -75,6 +75,10 @@ defm add_section "Make a section named
with the contents of .">, MetaVarName<"section=file">; +defm set_section_alignment + : Eq<"set-section-alignment", "Set alignment for a given section.">, + MetaVarName<"section=align">; + defm set_section_flags : Eq<"set-section-flags", "Set section flags for a given section. Flags supported for GNU " diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index f50d3448df398b..8b0a04c61049de 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -2076,27 +2076,37 @@ static void dumpObject(ObjectFile *O, const Archive *A = nullptr, if (StartAddress.getNumOccurrences() || StopAddress.getNumOccurrences()) checkForInvalidStartStopAddress(O, StartAddress, StopAddress); + // Note: the order here matches GNU objdump for compatability. StringRef ArchiveName = A ? A->getFileName() : ""; - if (FileHeaders) - printFileHeaders(O); if (ArchiveHeaders && !MachOOpt && C) printArchiveChild(ArchiveName, *C); - if (Disassemble) - disassembleObject(O, Relocations); - if (Relocations && !Disassemble) - printRelocations(O); - if (DynamicRelocations) - printDynamicRelocations(O); + if (FileHeaders) + printFileHeaders(O); if (PrivateHeaders || FirstPrivateHeader) printPrivateFileHeaders(O, FirstPrivateHeader); if (SectionHeaders) printSectionHeaders(O); - if (SectionContents) - printSectionContents(O); if (SymbolTable) printSymbolTable(O, ArchiveName); + if (DwarfDumpType != DIDT_Null) { + std::unique_ptr DICtx = DWARFContext::create(*O); + // Dump the complete DWARF structure. + DIDumpOptions DumpOpts; + DumpOpts.DumpType = DwarfDumpType; + DICtx->dump(outs(), DumpOpts); + } + if (Relocations && !Disassemble) + printRelocations(O); + if (DynamicRelocations) + printDynamicRelocations(O); + if (SectionContents) + printSectionContents(O); + if (Disassemble) + disassembleObject(O, Relocations); if (UnwindInfo) printUnwindInfo(O); + + // Mach-O specific options: if (ExportsTrie) printExportsTrie(O); if (Rebase) @@ -2107,17 +2117,12 @@ static void dumpObject(ObjectFile *O, const Archive *A = nullptr, printLazyBindTable(O); if (WeakBind) printWeakBindTable(O); + + // Other special sections: if (RawClangAST) printRawClangAST(O); if (FaultMapSection) printFaultMaps(O); - if (DwarfDumpType != DIDT_Null) { - std::unique_ptr DICtx = DWARFContext::create(*O); - // Dump the complete DWARF structure. - DIDumpOptions DumpOpts; - DumpOpts.DumpType = DwarfDumpType; - DICtx->dump(outs(), DumpOpts); - } } static void dumpObject(const COFFImportFile *I, const Archive *A, diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index e311c1069f7228..1470442c38b618 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -439,12 +439,35 @@ static void populateProfileSymbolList(MemoryBuffer *Buffer, PSL.add(symbol); } +static void handleExtBinaryWriter(sampleprof::SampleProfileWriter &Writer, + ProfileFormat OutputFormat, + MemoryBuffer *Buffer, + sampleprof::ProfileSymbolList &WriterList, + bool CompressAllSections) { + populateProfileSymbolList(Buffer, WriterList); + if (WriterList.size() > 0 && OutputFormat != PF_Ext_Binary) + warn("Profile Symbol list is not empty but the output format is not " + "ExtBinary format. The list will be lost in the output. "); + + Writer.setProfileSymbolList(&WriterList); + + if (CompressAllSections) { + if (OutputFormat != PF_Ext_Binary) { + warn("-compress-all-section is ignored. 
Specify -extbinary to enable it"); + } else { + auto ExtBinaryWriter = + static_cast(&Writer); + ExtBinaryWriter->setToCompressAllSections(); + } + } +} + static void mergeSampleProfile(const WeightedFileVector &Inputs, SymbolRemapper *Remapper, StringRef OutputFilename, ProfileFormat OutputFormat, StringRef ProfileSymbolListFile, - bool CompressProfSymList, FailureMode FailMode) { + bool CompressAllSections, FailureMode FailMode) { using namespace sampleprof; StringMap ProfileMap; SmallVector, 5> Readers; @@ -496,17 +519,12 @@ static void mergeSampleProfile(const WeightedFileVector &Inputs, if (std::error_code EC = WriterOrErr.getError()) exitWithErrorCode(EC, OutputFilename); + auto Writer = std::move(WriterOrErr.get()); // WriterList will have StringRef refering to string in Buffer. // Make sure Buffer lives as long as WriterList. auto Buffer = getInputFileBuf(ProfileSymbolListFile); - populateProfileSymbolList(Buffer.get(), WriterList); - WriterList.setToCompress(CompressProfSymList); - if (WriterList.size() > 0 && OutputFormat != PF_Ext_Binary) - warn("Profile Symbol list is not empty but the output format is not " - "ExtBinary format. The list will be lost in the output. "); - - auto Writer = std::move(WriterOrErr.get()); - Writer->setProfileSymbolList(&WriterList); + handleExtBinaryWriter(*Writer, OutputFormat, Buffer.get(), WriterList, + CompressAllSections); Writer->write(ProfileMap); } @@ -630,9 +648,10 @@ static int merge_main(int argc, const char *argv[]) { "prof-sym-list", cl::init(""), cl::desc("Path to file containing the list of function symbols " "used to populate profile symbol list")); - cl::opt CompressProfSymList( - "compress-prof-sym-list", cl::init(false), cl::Hidden, - cl::desc("Compress profile symbol list before write it into profile. 
")); + cl::opt CompressAllSections( + "compress-all-sections", cl::init(false), cl::Hidden, + cl::desc("Compress all sections when writing the profile (only " + "meaningful for -extbinary)")); cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n"); @@ -666,8 +685,8 @@ static int merge_main(int argc, const char *argv[]) { OutputFormat, OutputSparse, NumThreads, FailureMode); else mergeSampleProfile(WeightedInputs, Remapper.get(), OutputFilename, - OutputFormat, ProfileSymbolListFile, - CompressProfSymList, FailureMode); + OutputFormat, ProfileSymbolListFile, CompressAllSections, + FailureMode); return 0; } diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index af3d0e967d10d2..4e9cf213174f4d 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -173,11 +173,7 @@ template class ELFDumper : public ObjDumper { void printVersionInfo() override; void printGroupSections() override; - void printAttributes() override; - void printMipsPLTGOT() override; - void printMipsABIFlags() override; - void printMipsReginfo() override; - void printMipsOptions() override; + void printArchSpecificInfo() override; void printStackMap() const override; @@ -218,6 +214,10 @@ template class ELFDumper : public ObjDumper { S->sh_entsize, ObjF->getFileName()}); } + void printAttributes(); + void printMipsReginfo(); + void printMipsOptions(); + std::pair findDynamic(const ELFFile *Obj); void loadDynamicTable(const ELFFile *Obj); @@ -302,7 +302,7 @@ template class ELFDumper : public ObjDumper { void getSectionNameIndex(const Elf_Sym *Symbol, const Elf_Sym *FirstSym, StringRef &SectionName, unsigned &SectionIndex) const; - std::string getStaticSymbolName(uint32_t Index) const; + Expected getStaticSymbolName(uint32_t Index) const; std::string getDynamicString(uint64_t Value) const; StringRef getSymbolVersionByIndex(StringRef StrTab, uint32_t VersionSymbolIndex, @@ -429,6 +429,7 @@ template class DumpStyle { virtual void printStackSizeEntry(uint64_t Size, StringRef FuncName) = 0; virtual void printMipsGOT(const MipsGOTParser &Parser) = 0; virtual void printMipsPLT(const MipsGOTParser &Parser) = 0; + virtual void printMipsABIFlags(const ELFObjectFile *Obj) = 0; const ELFDumper *dumper() const { return Dumper; } protected: @@ -480,6 +481,7 @@ template class GNUStyle : public DumpStyle { void printStackSizeEntry(uint64_t Size, StringRef FuncName) override; void printMipsGOT(const MipsGOTParser &Parser) override; void printMipsPLT(const MipsGOTParser &Parser) override; + void printMipsABIFlags(const ELFObjectFile *Obj) override; private: struct Field { @@ -586,6 +588,7 @@ template class LLVMStyle : public DumpStyle { void printStackSizeEntry(uint64_t Size, StringRef FuncName) override; void printMipsGOT(const MipsGOTParser &Parser) override; void printMipsPLT(const MipsGOTParser &Parser) override; + void printMipsABIFlags(const ELFObjectFile *Obj) override; private: void printRelocation(const ELFO *Obj, Elf_Rela Rel, const Elf_Shdr *SymTab); @@ -751,17 +754,22 @@ static std::string maybeDemangle(StringRef Name) { } template -std::string ELFDumper::getStaticSymbolName(uint32_t Index) const { +Expected +ELFDumper::getStaticSymbolName(uint32_t Index) const { const ELFFile *Obj = ObjF->getELFFile(); - StringRef StrTable = unwrapOrError( - ObjF->getFileName(), Obj->getStringTableForSymtab(*DotSymtabSec)); - Elf_Sym_Range Syms = - unwrapOrError(ObjF->getFileName(), Obj->symbols(DotSymtabSec)); - if (Index >= Syms.size()) - 
reportError(createError("Invalid symbol index"), ObjF->getFileName()); - const Elf_Sym *Sym = &Syms[Index]; - return maybeDemangle( - unwrapOrError(ObjF->getFileName(), Sym->getName(StrTable))); + Expected SymOrErr = + Obj->getSymbol(DotSymtabSec, Index); + if (!SymOrErr) + return SymOrErr.takeError(); + + Expected StrTabOrErr = Obj->getStringTableForSymtab(*DotSymtabSec); + if (!StrTabOrErr) + return StrTabOrErr.takeError(); + + Expected NameOrErr = (*SymOrErr)->getName(*StrTabOrErr); + if (!NameOrErr) + return NameOrErr.takeError(); + return maybeDemangle(*NameOrErr); } template @@ -2210,6 +2218,30 @@ template void ELFDumper::printLoadName() { W.printString("LoadName", SOName); } +template void ELFDumper::printArchSpecificInfo() { + const ELFFile *Obj = ObjF->getELFFile(); + switch (Obj->getHeader()->e_machine) { + case EM_ARM: + printAttributes(); + break; + case EM_MIPS: { + ELFDumperStyle->printMipsABIFlags(ObjF); + printMipsOptions(); + printMipsReginfo(); + + MipsGOTParser Parser(Obj, ObjF->getFileName(), dynamic_table(), + dynamic_symbols()); + if (Parser.hasGot()) + ELFDumperStyle->printMipsGOT(Parser); + if (Parser.hasPlt()) + ELFDumperStyle->printMipsPLT(Parser); + break; + } + default: + break; + } +} + template void ELFDumper::printAttributes() { W.startLine() << "Attributes not implemented.\n"; } @@ -2315,7 +2347,7 @@ MipsGOTParser::MipsGOTParser(const ELFO *Obj, StringRef FileName, if (IsStatic) { GotSec = findSectionByName(*Obj, FileName, ".got"); if (!GotSec) - reportError(createError("Cannot find .got section"), FileName); + return; ArrayRef Content = unwrapOrError(FileName, Obj->getSectionContents(GotSec)); @@ -2517,20 +2549,6 @@ MipsGOTParser::getPltSym(const Entry *E) const { } } -template void ELFDumper::printMipsPLTGOT() { - const ELFFile *Obj = ObjF->getELFFile(); - if (Obj->getHeader()->e_machine != EM_MIPS) - reportError(createError("MIPS PLT GOT is available for MIPS targets only"), - ObjF->getFileName()); - - MipsGOTParser Parser(Obj, ObjF->getFileName(), dynamic_table(), - dynamic_symbols()); - if (Parser.hasGot()) - ELFDumperStyle->printMipsGOT(Parser); - if (Parser.hasPlt()) - ELFDumperStyle->printMipsPLT(Parser); -} - static const EnumEntry ElfMipsISAExtType[] = { {"None", Mips::AFL_EXT_NONE}, {"Broadcom SB-1", Mips::AFL_EXT_SB1}, @@ -2604,43 +2622,6 @@ static int getMipsRegisterSize(uint8_t Flag) { } } -template void ELFDumper::printMipsABIFlags() { - const ELFFile *Obj = ObjF->getELFFile(); - const Elf_Shdr *Shdr = - findSectionByName(*Obj, ObjF->getFileName(), ".MIPS.abiflags"); - if (!Shdr) { - W.startLine() << "There is no .MIPS.abiflags section in the file.\n"; - return; - } - ArrayRef Sec = - unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(Shdr)); - if (Sec.size() != sizeof(Elf_Mips_ABIFlags)) { - W.startLine() << "The .MIPS.abiflags section has a wrong size.\n"; - return; - } - - auto *Flags = reinterpret_cast *>(Sec.data()); - - raw_ostream &OS = W.getOStream(); - DictScope GS(W, "MIPS ABI Flags"); - - W.printNumber("Version", Flags->version); - W.startLine() << "ISA: "; - if (Flags->isa_rev <= 1) - OS << format("MIPS%u", Flags->isa_level); - else - OS << format("MIPS%ur%u", Flags->isa_level, Flags->isa_rev); - OS << "\n"; - W.printEnum("ISA Extension", Flags->isa_ext, makeArrayRef(ElfMipsISAExtType)); - W.printFlags("ASEs", Flags->ases, makeArrayRef(ElfMipsASEFlags)); - W.printEnum("FP ABI", Flags->fp_abi, makeArrayRef(ElfMipsFpABIType)); - W.printNumber("GPR size", getMipsRegisterSize(Flags->gpr_size)); - W.printNumber("CPR1 size", 
getMipsRegisterSize(Flags->cpr1_size)); - W.printNumber("CPR2 size", getMipsRegisterSize(Flags->cpr2_size)); - W.printFlags("Flags 1", Flags->flags1, makeArrayRef(ElfMipsFlags1)); - W.printHex("Flags 2", Flags->flags2); -} - template static void printMipsReginfoData(ScopedPrinter &W, const Elf_Mips_RegInfo &Reginfo) { @@ -3437,10 +3418,21 @@ template void GNUStyle::printHashSymbols(const ELFO *Obj) { for (uint32_t Buc = 0; Buc < SysVHash->nbucket; Buc++) { if (Buckets[Buc] == ELF::STN_UNDEF) continue; + std::vector Visited(SysVHash->nchain); for (uint32_t Ch = Buckets[Buc]; Ch < SysVHash->nchain; Ch = Chains[Ch]) { if (Ch == ELF::STN_UNDEF) break; + + if (Visited[Ch]) { + reportWarning( + createError(".hash section is invalid: bucket " + Twine(Ch) + + ": a cycle was detected in the linked chain"), + this->FileName); + break; + } + printHashedSymbol(Obj, &DynSyms[0], Ch, StringTable, Buc); + Visited[Ch] = true; } } } @@ -4060,7 +4052,7 @@ void GNUStyle::printCGProfile(const ELFFile *Obj) { template void GNUStyle::printAddrsig(const ELFFile *Obj) { - OS << "GNUStyle::printAddrsig not implemented\n"; + reportError(createError("--addrsig: not implemented"), this->FileName); } static StringRef getGenericNoteTypeName(const uint32_t NT) { @@ -5091,6 +5083,45 @@ void GNUStyle::printMipsPLT(const MipsGOTParser &Parser) { } } +template +void GNUStyle::printMipsABIFlags(const ELFObjectFile *ObjF) { + const ELFFile *Obj = ObjF->getELFFile(); + const Elf_Shdr *Shdr = + findSectionByName(*Obj, ObjF->getFileName(), ".MIPS.abiflags"); + if (!Shdr) + return; + + ArrayRef Sec = + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(Shdr)); + if (Sec.size() != sizeof(Elf_Mips_ABIFlags)) + reportError(createError(".MIPS.abiflags section has a wrong size"), + ObjF->getFileName()); + + auto *Flags = reinterpret_cast *>(Sec.data()); + + OS << "MIPS ABI Flags Version: " << Flags->version << "\n\n"; + OS << "ISA: MIPS" << int(Flags->isa_level); + if (Flags->isa_rev > 1) + OS << "r" << int(Flags->isa_rev); + OS << "\n"; + OS << "GPR size: " << getMipsRegisterSize(Flags->gpr_size) << "\n"; + OS << "CPR1 size: " << getMipsRegisterSize(Flags->cpr1_size) << "\n"; + OS << "CPR2 size: " << getMipsRegisterSize(Flags->cpr2_size) << "\n"; + OS << "FP ABI: " << printEnum(Flags->fp_abi, makeArrayRef(ElfMipsFpABIType)) + << "\n"; + OS << "ISA Extension: " + << printEnum(Flags->isa_ext, makeArrayRef(ElfMipsISAExtType)) << "\n"; + if (Flags->ases == 0) + OS << "ASEs: None\n"; + else + // FIXME: Print each flag on a separate line. 
+ OS << "ASEs: " << printFlags(Flags->ases, makeArrayRef(ElfMipsASEFlags)) + << "\n"; + OS << "FLAGS 1: " << format_hex_no_prefix(Flags->flags1, 8, false) << "\n"; + OS << "FLAGS 2: " << format_hex_no_prefix(Flags->flags2, 8, false) << "\n"; + OS << "\n"; +} + template void LLVMStyle::printFileHeaders(const ELFO *Obj) { const Elf_Ehdr *E = Obj->getHeader(); { @@ -5697,14 +5728,35 @@ void LLVMStyle::printCGProfile(const ELFFile *Obj) { this->dumper()->getDotCGProfileSec())); for (const Elf_CGProfile &CGPE : CGProfile) { DictScope D(W, "CGProfileEntry"); - W.printNumber("From", this->dumper()->getStaticSymbolName(CGPE.cgp_from), - CGPE.cgp_from); - W.printNumber("To", this->dumper()->getStaticSymbolName(CGPE.cgp_to), - CGPE.cgp_to); + W.printNumber( + "From", + unwrapOrError(this->FileName, + this->dumper()->getStaticSymbolName(CGPE.cgp_from)), + CGPE.cgp_from); + W.printNumber( + "To", + unwrapOrError(this->FileName, + this->dumper()->getStaticSymbolName(CGPE.cgp_to)), + CGPE.cgp_to); W.printNumber("Weight", CGPE.cgp_weight); } } +static Expected> toULEB128Array(ArrayRef Data) { + std::vector Ret; + const uint8_t *Cur = Data.begin(); + const uint8_t *End = Data.end(); + while (Cur != End) { + unsigned Size; + const char *Err; + Ret.push_back(decodeULEB128(Cur, &Size, End, &Err)); + if (Err) + return createError(Err); + Cur += Size; + } + return Ret; +} + template void LLVMStyle::printAddrsig(const ELFFile *Obj) { ListScope L(W, "Addrsig"); @@ -5713,18 +5765,20 @@ void LLVMStyle::printAddrsig(const ELFFile *Obj) { ArrayRef Contents = unwrapOrError( this->FileName, Obj->getSectionContents(this->dumper()->getDotAddrsigSec())); - const uint8_t *Cur = Contents.begin(); - const uint8_t *End = Contents.end(); - while (Cur != End) { - unsigned Size; - const char *Err; - uint64_t SymIndex = decodeULEB128(Cur, &Size, End, &Err); - if (Err) - reportError(createError(Err), this->FileName); + Expected> V = toULEB128Array(Contents); + if (!V) { + reportWarning(V.takeError(), this->FileName); + return; + } - W.printNumber("Sym", this->dumper()->getStaticSymbolName(SymIndex), - SymIndex); - Cur += Size; + for (uint64_t Sym : *V) { + Expected NameOrErr = this->dumper()->getStaticSymbolName(Sym); + if (NameOrErr) { + W.printNumber("Sym", *NameOrErr, Sym); + continue; + } + reportWarning(NameOrErr.takeError(), this->FileName); + W.printNumber("Sym", "", Sym); } } @@ -6013,3 +6067,41 @@ void LLVMStyle::printMipsPLT(const MipsGOTParser &Parser) { } } } + +template +void LLVMStyle::printMipsABIFlags(const ELFObjectFile *ObjF) { + const ELFFile *Obj = ObjF->getELFFile(); + const Elf_Shdr *Shdr = + findSectionByName(*Obj, ObjF->getFileName(), ".MIPS.abiflags"); + if (!Shdr) { + W.startLine() << "There is no .MIPS.abiflags section in the file.\n"; + return; + } + ArrayRef Sec = + unwrapOrError(ObjF->getFileName(), Obj->getSectionContents(Shdr)); + if (Sec.size() != sizeof(Elf_Mips_ABIFlags)) { + W.startLine() << "The .MIPS.abiflags section has a wrong size.\n"; + return; + } + + auto *Flags = reinterpret_cast *>(Sec.data()); + + raw_ostream &OS = W.getOStream(); + DictScope GS(W, "MIPS ABI Flags"); + + W.printNumber("Version", Flags->version); + W.startLine() << "ISA: "; + if (Flags->isa_rev <= 1) + OS << format("MIPS%u", Flags->isa_level); + else + OS << format("MIPS%ur%u", Flags->isa_level, Flags->isa_rev); + OS << "\n"; + W.printEnum("ISA Extension", Flags->isa_ext, makeArrayRef(ElfMipsISAExtType)); + W.printFlags("ASEs", Flags->ases, makeArrayRef(ElfMipsASEFlags)); + W.printEnum("FP ABI", Flags->fp_abi, 
makeArrayRef(ElfMipsFpABIType)); + W.printNumber("GPR size", getMipsRegisterSize(Flags->gpr_size)); + W.printNumber("CPR1 size", getMipsRegisterSize(Flags->cpr1_size)); + W.printNumber("CPR2 size", getMipsRegisterSize(Flags->cpr2_size)); + W.printFlags("Flags 1", Flags->flags1, makeArrayRef(ElfMipsFlags1)); + W.printHex("Flags 2", Flags->flags2); +} diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index e2c7e833320d23..2ba441342499c0 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -69,15 +69,7 @@ class ObjDumper { virtual void printNotes() {} virtual void printELFLinkerOptions() {} virtual void printStackSizes() {} - - // Only implemented for ARM ELF at this time. - virtual void printAttributes() { } - - // Only implemented for MIPS ELF at this time. - virtual void printMipsPLTGOT() { } - virtual void printMipsABIFlags() { } - virtual void printMipsReginfo() { } - virtual void printMipsOptions() { } + virtual void printArchSpecificInfo() { } // Only implemented for PE/COFF. virtual void printCOFFImports() { } diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index 5919a7eed3e3a7..4db13897879d11 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -237,23 +237,6 @@ namespace opts { cl::alias ArchSpecifcInfoShort("A", cl::desc("Alias for --arch-specific"), cl::aliasopt(ArchSpecificInfo), cl::NotHidden); - // --mips-plt-got - cl::opt - MipsPLTGOT("mips-plt-got", - cl::desc("Display the MIPS GOT and PLT GOT sections")); - - // --mips-abi-flags - cl::opt MipsABIFlags("mips-abi-flags", - cl::desc("Display the MIPS.abiflags section")); - - // --mips-reginfo - cl::opt MipsReginfo("mips-reginfo", - cl::desc("Display the MIPS .reginfo section")); - - // --mips-options - cl::opt MipsOptions("mips-options", - cl::desc("Display the MIPS .MIPS.options section")); - // --coff-imports cl::opt COFFImports("coff-imports", cl::desc("Display the PE/COFF import table")); @@ -414,17 +397,6 @@ void reportWarning(Error Err, StringRef Input) { } // namespace llvm -static bool isMipsArch(unsigned Arch) { - switch (Arch) { - case llvm::Triple::mips: - case llvm::Triple::mipsel: - case llvm::Triple::mips64: - case llvm::Triple::mips64el: - return true; - default: - return false; - } -} namespace { struct ReadObjTypeTableBuilder { ReadObjTypeTableBuilder() @@ -520,18 +492,7 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer, if (opts::ELFLinkerOptions) Dumper->printELFLinkerOptions(); if (opts::ArchSpecificInfo) - if (Obj->getArch() == llvm::Triple::arm) - Dumper->printAttributes(); - if (isMipsArch(Obj->getArch())) { - if (opts::MipsPLTGOT) - Dumper->printMipsPLTGOT(); - if (opts::MipsABIFlags) - Dumper->printMipsABIFlags(); - if (opts::MipsReginfo) - Dumper->printMipsReginfo(); - if (opts::MipsOptions) - Dumper->printMipsOptions(); - } + Dumper->printArchSpecificInfo(); if (opts::SectionGroups) Dumper->printGroupSections(); if (opts::HashHistogram) @@ -730,8 +691,10 @@ int main(int argc, const char *argv[]) { opts::UnwindInfo = true; opts::SectionGroups = true; opts::HashHistogram = true; - if (opts::Output == opts::LLVM) + if (opts::Output == opts::LLVM) { + opts::Addrsig = true; opts::PrintStackSizes = true; + } } if (opts::Headers) { diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp index c4b6eb79d18c08..2c17b9570e1ba5 100644 --- a/llvm/tools/obj2yaml/elf2yaml.cpp +++ 
b/llvm/tools/obj2yaml/elf2yaml.cpp @@ -41,6 +41,7 @@ class ELFDumper { Expected getUniquedSymbolName(const Elf_Sym *Sym, StringRef StrTable, const Elf_Shdr *SymTab); + Expected getSymbolName(uint32_t SymtabNdx, uint32_t SymbolNdx); const object::ELFFile &Obj; ArrayRef ShndxTable; @@ -56,6 +57,7 @@ class ELFDumper { Error dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab, ELFYAML::Relocation &R); + Expected dumpAddrsigSection(const Elf_Shdr *Shdr); Expected dumpDynamicSection(const Elf_Shdr *Shdr); Expected dumpRelocSection(const Elf_Shdr *Shdr); Expected @@ -284,6 +286,13 @@ template Expected ELFDumper::dump() { Y->Sections.emplace_back(*SecOrErr); break; } + case ELF::SHT_LLVM_ADDRSIG: { + Expected SecOrErr = dumpAddrsigSection(&Sec); + if (!SecOrErr) + return SecOrErr.takeError(); + Y->Sections.emplace_back(*SecOrErr); + break; + } case ELF::SHT_NULL: { // We only dump the SHT_NULL section at index 0 when it // has at least one non-null field, because yaml2obj @@ -519,6 +528,46 @@ ELFDumper::dumpStackSizesSection(const Elf_Shdr *Shdr) { return S.release(); } +template +Expected +ELFDumper::dumpAddrsigSection(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); + + auto ContentOrErr = Obj.getSectionContents(Shdr); + if (!ContentOrErr) + return ContentOrErr.takeError(); + + ArrayRef Content = *ContentOrErr; + DataExtractor::Cursor Cur(0); + DataExtractor Data(Content, Obj.isLE(), /*AddressSize=*/0); + std::vector Symbols; + while (Cur && Cur.tell() < Content.size()) { + uint64_t SymNdx = Data.getULEB128(Cur); + if (!Cur) + break; + + Expected SymbolName = getSymbolName(Shdr->sh_link, SymNdx); + if (!SymbolName || SymbolName->empty()) { + consumeError(SymbolName.takeError()); + Symbols.emplace_back(SymNdx); + continue; + } + + Symbols.emplace_back(*SymbolName); + } + + if (Cur) { + S->Symbols = std::move(Symbols); + return S.release(); + } + + consumeError(Cur.takeError()); + S->Content = yaml::BinaryRef(Content); + return S.release(); +} + template Expected ELFDumper::dumpDynamicSection(const Elf_Shdr *Shdr) { @@ -791,25 +840,31 @@ ELFDumper::dumpVerneedSection(const Elf_Shdr *Shdr) { } template -Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { - auto S = std::make_unique(); - if (Error E = dumpCommonSection(Shdr, *S)) - return std::move(E); - - auto SymtabOrErr = Obj.getSection(Shdr->sh_link); +Expected ELFDumper::getSymbolName(uint32_t SymtabNdx, + uint32_t SymbolNdx) { + auto SymtabOrErr = Obj.getSection(SymtabNdx); if (!SymtabOrErr) return SymtabOrErr.takeError(); - // Get symbol with index sh_info which name is the signature of the group. + const Elf_Shdr *Symtab = *SymtabOrErr; - auto SymOrErr = Obj.getSymbol(Symtab, Shdr->sh_info); + auto SymOrErr = Obj.getSymbol(Symtab, SymbolNdx); if (!SymOrErr) return SymOrErr.takeError(); + auto StrTabOrErr = Obj.getStringTableForSymtab(*Symtab); if (!StrTabOrErr) return StrTabOrErr.takeError(); + return getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); +} + +template +Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); - Expected SymbolName = - getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); + // Get symbol with index sh_info. This symbol's name is the signature of the group. 
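// (For a SHT_GROUP section, sh_link identifies the symbol table and sh_info
// is the index of the signature symbol inside it, which is why both are
// forwarded to getSymbolName here.)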
+ Expected SymbolName = getSymbolName(Shdr->sh_link, Shdr->sh_info); if (!SymbolName) return SymbolName.takeError(); S->Signature = *SymbolName; diff --git a/llvm/tools/opt-viewer/CMakeLists.txt b/llvm/tools/opt-viewer/CMakeLists.txt index 19b60693308201..ead73ec13a8f5d 100644 --- a/llvm/tools/opt-viewer/CMakeLists.txt +++ b/llvm/tools/opt-viewer/CMakeLists.txt @@ -11,3 +11,10 @@ foreach (file ${files}) DESTINATION share/opt-viewer COMPONENT opt-viewer) endforeach (file) + +add_custom_target(opt-viewer DEPENDS ${files}) +if(NOT LLVM_ENABLE_IDE) + add_llvm_install_targets("install-opt-viewer" + DEPENDS opt-viewer + COMPONENT opt-viewer) +endif() diff --git a/llvm/unittests/Analysis/LoopInfoTest.cpp b/llvm/unittests/Analysis/LoopInfoTest.cpp index 2528078ac67b74..5504ac11240b7d 100644 --- a/llvm/unittests/Analysis/LoopInfoTest.cpp +++ b/llvm/unittests/Analysis/LoopInfoTest.cpp @@ -1272,6 +1272,91 @@ TEST(LoopInfoTest, AuxiliaryIV) { }); } +TEST(LoopInfoTest, LoopNotInSimplifyForm) { + const char *ModuleStr = + "define void @foo(i32 %n) {\n" + "entry:\n" + " %guard.cmp = icmp sgt i32 %n, 0\n" + " br i1 %guard.cmp, label %for.cond, label %for.end\n" + "for.cond:\n" + " %i.0 = phi i32 [ 0, %entry ], [ %inc, %latch.1 ], [ %inc, %latch.2 ]\n" + " %inc = add nsw i32 %i.0, 1\n" + " %cmp = icmp slt i32 %i.0, %n\n" + " br i1 %cmp, label %latch.1, label %for.end\n" + "latch.1:\n" + " br i1 undef, label %for.cond, label %latch.2\n" + "latch.2:\n" + " br label %for.cond\n" + "for.end:\n" + " ret void\n" + "}\n"; + + // Parse the module. + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleStr); + + runWithLoopInfo(*M, "foo", [&](Function &F, LoopInfo &LI) { + Function::iterator FI = F.begin(); + // First basic block is entry - skip it. + BasicBlock *Header = &*(++FI); + assert(Header && "No header"); + Loop *L = LI.getLoopFor(Header); + EXPECT_NE(L, nullptr); + EXPECT_FALSE(L->isLoopSimplifyForm()); + // No loop guard because loop in not in simplify form. + EXPECT_EQ(L->getLoopGuardBranch(), nullptr); + EXPECT_FALSE(L->isGuarded()); + }); +} + +TEST(LoopInfoTest, LoopLatchNotExiting) { + const char *ModuleStr = + "define void @foo(i32* %A, i32 %ub) {\n" + "entry:\n" + " %guardcmp = icmp slt i32 0, %ub\n" + " br i1 %guardcmp, label %for.preheader, label %for.end\n" + "for.preheader:\n" + " br label %for.body\n" + "for.body:\n" + " %i = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]\n" + " %idxprom = sext i32 %i to i64\n" + " %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom\n" + " store i32 %i, i32* %arrayidx, align 4\n" + " %inc = add nsw i32 %i, 1\n" + " %cmp = icmp slt i32 %inc, %ub\n" + " br i1 %cmp, label %for.latch, label %for.exit\n" + "for.latch:\n" + " br label %for.body\n" + "for.exit:\n" + " br label %for.end\n" + "for.end:\n" + " ret void\n" + "}\n"; + + // Parse the module. + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleStr); + + runWithLoopInfoPlus( + *M, "foo", + [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) { + Function::iterator FI = F.begin(); + // First two basic block are entry and for.preheader - skip them. + ++FI; + BasicBlock *Header = &*(++FI); + BasicBlock *Latch = &*(++FI); + assert(Header && "No header"); + Loop *L = LI.getLoopFor(Header); + EXPECT_NE(L, nullptr); + EXPECT_TRUE(L->isLoopSimplifyForm()); + EXPECT_EQ(L->getLoopLatch(), Latch); + EXPECT_FALSE(L->isLoopExiting(Latch)); + // No loop guard becuase loop is not exiting on latch. 
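// (getLoopGuardBranch only recognizes a guard for a rotated loop, i.e. one
// whose latch is also an exiting block, so it is expected to return nullptr
// in this test.)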
+ EXPECT_EQ(L->getLoopGuardBranch(), nullptr); + EXPECT_FALSE(L->isGuarded()); + }); +} + // Examine getUniqueExitBlocks/getUniqueNonLatchExitBlocks functions. TEST(LoopInfoTest, LoopUniqueExitBlocks) { const char *ModuleStr = diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt index 6bb2fb8eb9239e..9384bdad0434af 100644 --- a/llvm/unittests/CMakeLists.txt +++ b/llvm/unittests/CMakeLists.txt @@ -31,8 +31,9 @@ add_subdirectory(Remarks) add_subdirectory(Passes) add_subdirectory(ProfileData) add_subdirectory(Support) -add_subdirectory(TextAPI) +add_subdirectory(TableGen) add_subdirectory(Target) +add_subdirectory(TextAPI) add_subdirectory(Transforms) add_subdirectory(XRay) add_subdirectory(tools) diff --git a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestCommon.cpp b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestCommon.cpp index 23f8a691c8ffef..c5d7dc2fdc9c3e 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestCommon.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestCommon.cpp @@ -145,7 +145,7 @@ void JITLinkTestCommon::TestJITLinkContext::notifyFailed(Error Err) { void JITLinkTestCommon::TestJITLinkContext::lookup( const DenseSet &Symbols, - JITLinkAsyncLookupContinuation LookupContinuation) { + std::unique_ptr LC) { jitlink::AsyncLookupResult LookupResult; DenseSet MissingSymbols; for (const auto &Symbol : Symbols) { @@ -157,7 +157,7 @@ void JITLinkTestCommon::TestJITLinkContext::lookup( } if (MissingSymbols.empty()) - LookupContinuation(std::move(LookupResult)); + LC->run(std::move(LookupResult)); else { std::string ErrMsg; { @@ -167,12 +167,12 @@ void JITLinkTestCommon::TestJITLinkContext::lookup( ErrMsgStream << " " << Sym; ErrMsgStream << " ]\n"; } - LookupContinuation( + LC->run( make_error(std::move(ErrMsg), inconvertibleErrorCode())); } } -void JITLinkTestCommon::TestJITLinkContext::notifyResolved(AtomGraph &G) { +void JITLinkTestCommon::TestJITLinkContext::notifyResolved(LinkGraph &G) { if (NotifyResolved) NotifyResolved(G); } @@ -186,7 +186,7 @@ void JITLinkTestCommon::TestJITLinkContext::notifyFinalized( Error JITLinkTestCommon::TestJITLinkContext::modifyPassConfig( const Triple &TT, PassConfiguration &Config) { if (TestCase) - Config.PostFixupPasses.push_back([&](AtomGraph &G) -> Error { + Config.PostFixupPasses.push_back([&](LinkGraph &G) -> Error { TestCase(G); return Error::success(); }); @@ -196,11 +196,11 @@ Error JITLinkTestCommon::TestJITLinkContext::modifyPassConfig( JITLinkTestCommon::JITLinkTestCommon() { initializeLLVMTargets(); } Expected> -JITLinkTestCommon::disassemble(const MCDisassembler &Dis, - jitlink::DefinedAtom &Atom, size_t Offset) { +JITLinkTestCommon::disassemble(const MCDisassembler &Dis, jitlink::Block &B, + size_t Offset) { ArrayRef InstBuffer( - reinterpret_cast(Atom.getContent().data()) + Offset, - Atom.getContent().size() - Offset); + reinterpret_cast(B.getContent().data()) + Offset, + B.getContent().size() - Offset); MCInst Inst; uint64_t InstSize; @@ -214,11 +214,9 @@ JITLinkTestCommon::disassemble(const MCDisassembler &Dis, return std::make_pair(Inst, InstSize); } -Expected -JITLinkTestCommon::decodeImmediateOperand(const MCDisassembler &Dis, - jitlink::DefinedAtom &Atom, - size_t OpIdx, size_t Offset) { - auto InstAndSize = disassemble(Dis, Atom, Offset); +Expected JITLinkTestCommon::decodeImmediateOperand( + const MCDisassembler &Dis, jitlink::Block &B, size_t OpIdx, size_t Offset) { + auto InstAndSize = disassemble(Dis, B, Offset); if (!InstAndSize) return InstAndSize.takeError(); diff --git 
a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestCommon.h b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestCommon.h index 8e1273ed91198e..5c90532d897aa0 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestCommon.h +++ b/llvm/unittests/ExecutionEngine/JITLink/JITLinkTestCommon.h @@ -77,9 +77,9 @@ class JITLinkTestCommon { class TestJITLinkContext : public jitlink::JITLinkContext { public: - using TestCaseFunction = std::function; + using TestCaseFunction = std::function; - using NotifyResolvedFunction = std::function; + using NotifyResolvedFunction = std::function; using NotifyFinalizedFunction = std::function)>; @@ -103,11 +103,11 @@ class JITLinkTestCommon { void notifyFailed(Error Err) override; - void - lookup(const DenseSet &Symbols, - jitlink::JITLinkAsyncLookupContinuation LookupContinuation) override; + void lookup( + const DenseSet &Symbols, + std::unique_ptr LC) override; - void notifyResolved(jitlink::AtomGraph &G) override; + void notifyResolved(jitlink::LinkGraph &G) override; void notifyFinalized( std::unique_ptr A) override; @@ -140,56 +140,60 @@ class JITLinkTestCommon { } template - static Expected readInt(jitlink::AtomGraph &G, jitlink::DefinedAtom &A, + static Expected readInt(jitlink::LinkGraph &G, jitlink::Block &B, size_t Offset = 0) { - if (Offset + sizeof(T) > A.getContent().size()) - return make_error("Reading past end of atom content", + if (Offset + sizeof(T) > B.getSize()) + return make_error("Reading past end of block content", inconvertibleErrorCode()); - return support::endian::read(A.getContent().data() + Offset, + return support::endian::read(B.getContent().data() + Offset, G.getEndianness()); } template - static Expected readInt(jitlink::AtomGraph &G, StringRef AtomName, + static Expected readInt(jitlink::LinkGraph &G, StringRef SymbolName, size_t Offset = 0) { - auto DA = G.findDefinedAtomByName(AtomName); - if (!DA) - return DA.takeError(); - return readInt(G, *DA); + for (auto *Sym : G.defined_symbols()) { + if (Sym->getName() == SymbolName) + return readInt(G, Sym->getBlock(), Sym->getOffset() + Offset); + } + return make_error("Symbol \"" + SymbolName + "\" not found", + inconvertibleErrorCode()); } static Expected> - disassemble(const MCDisassembler &Dis, jitlink::DefinedAtom &Atom, - size_t Offset = 0); + disassemble(const MCDisassembler &Dis, jitlink::Block &B, size_t Offset = 0); static Expected decodeImmediateOperand(const MCDisassembler &Dis, - jitlink::DefinedAtom &Atom, + jitlink::Block &B, size_t OpIdx, size_t Offset = 0); - static jitlink::Atom &atom(jitlink::AtomGraph &G, StringRef Name) { - return G.getAtomByName(Name); + static jitlink::Symbol &symbol(jitlink::LinkGraph &G, StringRef Name) { + for (auto *Sym : G.defined_symbols()) + if (Sym->getName() == Name) + return *Sym; + for (auto *Sym : G.external_symbols()) + if (Sym->getName() == Name) + return *Sym; + for (auto *Sym : G.absolute_symbols()) + if (Sym->getName() == Name) + return *Sym; + llvm_unreachable("Name must reference a symbol"); } - static jitlink::DefinedAtom &definedAtom(jitlink::AtomGraph &G, - StringRef Name) { - return G.getDefinedAtomByName(Name); - } - - static JITTargetAddress atomAddr(jitlink::AtomGraph &G, StringRef Name) { - return atom(G, Name).getAddress(); + static JITTargetAddress symbolAddr(jitlink::LinkGraph &G, StringRef Name) { + return symbol(G, Name).getAddress(); } template - static size_t countEdgesMatching(jitlink::DefinedAtom &DA, - const PredT &Pred) { - return std::count_if(DA.edges().begin(), DA.edges().end(), Pred); + static 
size_t countEdgesMatching(jitlink::Block &B, const PredT &Pred) { + return std::count_if(B.edges().begin(), B.edges().end(), Pred); } template - static size_t countEdgesMatching(jitlink::AtomGraph &G, StringRef Name, + static size_t countEdgesMatching(jitlink::LinkGraph &G, StringRef Name, const PredT &Pred) { - return countEdgesMatching(definedAtom(G, Name), Pred); + return countEdgesMatching(symbol(G, Name), Pred); } private: diff --git a/llvm/unittests/ExecutionEngine/JITLink/MachO_x86_64_Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/MachO_x86_64_Tests.cpp index e051ad551c757d..9b76edae49992e 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/MachO_x86_64_Tests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/MachO_x86_64_Tests.cpp @@ -24,7 +24,7 @@ class JITLinkTest_MachO_x86_64 : public JITLinkTestCommon, public testing::Test { public: using BasicVerifyGraphFunction = - std::function; + std::function; void runBasicVerifyGraphTest(StringRef AsmSrc, StringRef Triple, StringMap Externals, @@ -40,7 +40,7 @@ class JITLinkTest_MachO_x86_64 : public JITLinkTestCommon, } auto JTCtx = std::make_unique( - **TR, [&](AtomGraph &G) { RunGraphTest(G, (*TR)->getDisassembler()); }); + **TR, [&](LinkGraph &G) { RunGraphTest(G, (*TR)->getDisassembler()); }); JTCtx->externals() = std::move(Externals); @@ -48,78 +48,77 @@ class JITLinkTest_MachO_x86_64 : public JITLinkTestCommon, } protected: - static void verifyIsPointerTo(AtomGraph &G, DefinedAtom &A, Atom &Target) { - EXPECT_EQ(A.edges_size(), 1U) << "Incorrect number of edges for pointer"; - if (A.edges_size() != 1U) + static void verifyIsPointerTo(LinkGraph &G, Block &B, Symbol &Target) { + EXPECT_EQ(B.edges_size(), 1U) << "Incorrect number of edges for pointer"; + if (B.edges_size() != 1U) return; - auto &E = *A.edges().begin(); + auto &E = *B.edges().begin(); + EXPECT_EQ(E.getOffset(), 0U) << "Expected edge offset of zero"; EXPECT_EQ(E.getKind(), Pointer64) << "Expected pointer to have a pointer64 relocation"; EXPECT_EQ(&E.getTarget(), &Target) << "Expected edge to point at target"; - EXPECT_THAT_EXPECTED(readInt(G, A), HasValue(Target.getAddress())) + EXPECT_THAT_EXPECTED(readInt(G, B), HasValue(Target.getAddress())) << "Pointer does not point to target"; } - static void verifyGOTLoad(AtomGraph &G, DefinedAtom &A, Edge &E, - Atom &Target) { + static void verifyGOTLoad(LinkGraph &G, Edge &E, Symbol &Target) { EXPECT_EQ(E.getAddend(), 0U) << "Expected GOT load to have a zero addend"; EXPECT_TRUE(E.getTarget().isDefined()) - << "GOT entry should be a defined atom"; + << "GOT entry should be a defined symbol"; if (!E.getTarget().isDefined()) return; - verifyIsPointerTo(G, static_cast(E.getTarget()), Target); + verifyIsPointerTo(G, E.getTarget().getBlock(), Target); } - static void verifyCall(const MCDisassembler &Dis, AtomGraph &G, - DefinedAtom &Caller, Edge &E, Atom &Callee) { + static void verifyCall(const MCDisassembler &Dis, LinkGraph &G, + Block &CallerBlock, Edge &E, Symbol &Callee) { EXPECT_EQ(E.getKind(), Branch32) << "Edge is not a Branch32"; EXPECT_EQ(E.getAddend(), 0U) << "Expected no addend on stub call"; EXPECT_EQ(&E.getTarget(), &Callee) << "Edge does not point at expected callee"; - JITTargetAddress FixupAddress = Caller.getAddress() + E.getOffset(); + JITTargetAddress FixupAddress = CallerBlock.getAddress() + E.getOffset(); uint64_t PCRelDelta = Callee.getAddress() - (FixupAddress + 4); EXPECT_THAT_EXPECTED( - decodeImmediateOperand(Dis, Caller, 0, E.getOffset() - 1), + decodeImmediateOperand(Dis, CallerBlock, 0, E.getOffset() 
- 1), HasValue(PCRelDelta)); } - static void verifyIndirectCall(const MCDisassembler &Dis, AtomGraph &G, - DefinedAtom &Caller, Edge &E, Atom &Callee) { + static void verifyIndirectCall(const MCDisassembler &Dis, LinkGraph &G, + Block &CallerBlock, Edge &E, Symbol &Callee) { EXPECT_EQ(E.getKind(), PCRel32) << "Edge is not a PCRel32"; EXPECT_EQ(E.getAddend(), 0) << "Expected no addend on stub cal"; - EXPECT_TRUE(E.getTarget().isDefined()) << "Target is not a defined atom"; + EXPECT_TRUE(E.getTarget().isDefined()) << "Target is not a defined symbol"; if (!E.getTarget().isDefined()) return; - verifyIsPointerTo(G, static_cast(E.getTarget()), Callee); + verifyIsPointerTo(G, E.getTarget().getBlock(), Callee); - JITTargetAddress FixupAddress = Caller.getAddress() + E.getOffset(); + JITTargetAddress FixupAddress = CallerBlock.getAddress() + E.getOffset(); uint64_t PCRelDelta = E.getTarget().getAddress() - (FixupAddress + 4); EXPECT_THAT_EXPECTED( - decodeImmediateOperand(Dis, Caller, 3, E.getOffset() - 2), + decodeImmediateOperand(Dis, CallerBlock, 3, E.getOffset() - 2), HasValue(PCRelDelta)); } - static void verifyCallViaStub(const MCDisassembler &Dis, AtomGraph &G, - DefinedAtom &Caller, Edge &E, Atom &Callee) { - verifyCall(Dis, G, Caller, E, E.getTarget()); + static void verifyCallViaStub(const MCDisassembler &Dis, LinkGraph &G, + Block &CallerBlock, Edge &E, Symbol &Callee) { + verifyCall(Dis, G, CallerBlock, E, E.getTarget()); if (!E.getTarget().isDefined()) { ADD_FAILURE() << "Edge target is not a stub"; return; } - auto &StubAtom = static_cast(E.getTarget()); - EXPECT_EQ(StubAtom.edges_size(), 1U) + auto &StubBlock = E.getTarget().getBlock(); + EXPECT_EQ(StubBlock.edges_size(), 1U) << "Expected one edge from stub to target"; - auto &StubEdge = *StubAtom.edges().begin(); + auto &StubEdge = *StubBlock.edges().begin(); - verifyIndirectCall(Dis, G, static_cast(StubAtom), StubEdge, - Callee); + verifyIndirectCall(Dis, G, StubBlock, StubEdge, Callee); } }; @@ -161,24 +160,24 @@ TEST_F(JITLinkTest_MachO_x86_64, BasicRelocations) { {{"_y", JITEvaluatedSymbol(0xdeadbeef, JITSymbolFlags::Exported)}, {"_baz", JITEvaluatedSymbol(0xcafef00d, JITSymbolFlags::Exported)}}, true, false, MCTargetOptions(), - [](AtomGraph &G, const MCDisassembler &Dis) { - // Name the atoms in the asm above. - auto &Baz = atom(G, "_baz"); - auto &Y = atom(G, "_y"); - - auto &Bar = definedAtom(G, "_bar"); - auto &Foo = definedAtom(G, "_foo"); - auto &Foo_1 = definedAtom(G, "_foo.1"); - auto &Foo_2 = definedAtom(G, "_foo.2"); - auto &X = definedAtom(G, "_x"); - auto &P = definedAtom(G, "_p"); + [](LinkGraph &G, const MCDisassembler &Dis) { + // Name the symbols in the asm above. 
+ auto &Baz = symbol(G, "_baz"); + auto &Y = symbol(G, "_y"); + auto &Bar = symbol(G, "_bar"); + auto &Foo = symbol(G, "_foo"); + auto &Foo_1 = symbol(G, "_foo.1"); + auto &Foo_2 = symbol(G, "_foo.2"); + auto &X = symbol(G, "_x"); + auto &P = symbol(G, "_p"); // Check unsigned reloc for _p { - EXPECT_EQ(P.edges_size(), 1U) << "Unexpected number of relocations"; - EXPECT_EQ(P.edges().begin()->getKind(), Pointer64) + EXPECT_EQ(P.getBlock().edges_size(), 1U) + << "Unexpected number of relocations"; + EXPECT_EQ(P.getBlock().edges().begin()->getKind(), Pointer64) << "Unexpected edge kind for _p"; - EXPECT_THAT_EXPECTED(readInt(G, P), + EXPECT_THAT_EXPECTED(readInt(G, P.getBlock()), HasValue(X.getAddress())) << "Unsigned relocation did not apply correctly"; } @@ -188,41 +187,45 @@ TEST_F(JITLinkTest_MachO_x86_64, BasicRelocations) { // indirect call, and that the pointer for the indirect call points to // baz. { - EXPECT_EQ(Bar.edges_size(), 1U) + EXPECT_EQ(Bar.getBlock().edges_size(), 1U) << "Incorrect number of edges for bar"; - EXPECT_EQ(Bar.edges().begin()->getKind(), Branch32) + EXPECT_EQ(Bar.getBlock().edges().begin()->getKind(), Branch32) << "Unexpected edge kind for _bar"; - verifyCallViaStub(Dis, G, Bar, *Bar.edges().begin(), Baz); + verifyCallViaStub(Dis, G, Bar.getBlock(), + *Bar.getBlock().edges().begin(), Baz); } // Check that _foo is a direct call to _bar. { - EXPECT_EQ(Foo.edges_size(), 1U) + EXPECT_EQ(Foo.getBlock().edges_size(), 1U) << "Incorrect number of edges for foo"; - EXPECT_EQ(Foo.edges().begin()->getKind(), Branch32); - verifyCall(Dis, G, Foo, *Foo.edges().begin(), Bar); + EXPECT_EQ(Foo.getBlock().edges().begin()->getKind(), Branch32); + verifyCall(Dis, G, Foo.getBlock(), *Foo.getBlock().edges().begin(), + Bar); } // Check .got load in _foo.1 { - EXPECT_EQ(Foo_1.edges_size(), 1U) + EXPECT_EQ(Foo_1.getBlock().edges_size(), 1U) << "Incorrect number of edges for foo_1"; - EXPECT_EQ(Foo_1.edges().begin()->getKind(), PCRel32); - verifyGOTLoad(G, Foo_1, *Foo_1.edges().begin(), Y); + EXPECT_EQ(Foo_1.getBlock().edges().begin()->getKind(), PCRel32); + verifyGOTLoad(G, *Foo_1.getBlock().edges().begin(), Y); } // Check PCRel ref to _p in _foo.2 { - EXPECT_EQ(Foo_2.edges_size(), 1U) + EXPECT_EQ(Foo_2.getBlock().edges_size(), 1U) << "Incorrect number of edges for foo_2"; - EXPECT_EQ(Foo_2.edges().begin()->getKind(), PCRel32); + EXPECT_EQ(Foo_2.getBlock().edges().begin()->getKind(), PCRel32); JITTargetAddress FixupAddress = - Foo_2.getAddress() + Foo_2.edges().begin()->getOffset(); + Foo_2.getBlock().getAddress() + + Foo_2.getBlock().edges().begin()->getOffset(); uint64_t PCRelDelta = P.getAddress() - (FixupAddress + 4); - EXPECT_THAT_EXPECTED(decodeImmediateOperand(Dis, Foo_2, 4, 0), - HasValue(PCRelDelta)) + EXPECT_THAT_EXPECTED( + decodeImmediateOperand(Dis, Foo_2.getBlock(), 4, 0), + HasValue(PCRelDelta)) << "PCRel load does not reference expected target"; } }); diff --git a/llvm/unittests/Support/FileCheckTest.cpp b/llvm/unittests/Support/FileCheckTest.cpp index 3fbc06b467be85..66465f104eb5e3 100644 --- a/llvm/unittests/Support/FileCheckTest.cpp +++ b/llvm/unittests/Support/FileCheckTest.cpp @@ -62,10 +62,9 @@ uint64_t doAdd(uint64_t OpL, uint64_t OpR) { return OpL + OpR; } TEST_F(FileCheckTest, NumericVariable) { // Undefined variable: getValue and eval fail, error returned by eval holds // the name of the undefined variable. 
- FileCheckNumericVariable FooVar = FileCheckNumericVariable("FOO", 1); + FileCheckNumericVariable FooVar("FOO", 1); EXPECT_EQ("FOO", FooVar.getName()); - FileCheckNumericVariableUse FooVarUse = - FileCheckNumericVariableUse("FOO", &FooVar); + FileCheckNumericVariableUse FooVarUse("FOO", &FooVar); EXPECT_FALSE(FooVar.getValue()); Expected EvalResult = FooVarUse.eval(); ASSERT_FALSE(EvalResult); @@ -91,16 +90,15 @@ TEST_F(FileCheckTest, NumericVariable) { } TEST_F(FileCheckTest, Binop) { - FileCheckNumericVariable FooVar = FileCheckNumericVariable("FOO", 1); + FileCheckNumericVariable FooVar("FOO", 1); FooVar.setValue(42); std::unique_ptr FooVarUse = std::make_unique("FOO", &FooVar); - FileCheckNumericVariable BarVar = FileCheckNumericVariable("BAR", 2); + FileCheckNumericVariable BarVar("BAR", 2); BarVar.setValue(18); std::unique_ptr BarVarUse = std::make_unique("BAR", &BarVar); - FileCheckASTBinop Binop = - FileCheckASTBinop(doAdd, std::move(FooVarUse), std::move(BarVarUse)); + FileCheckASTBinop Binop(doAdd, std::move(FooVarUse), std::move(BarVarUse)); // Defined variable: eval returns right value. Expected Value = Binop.eval(); @@ -217,8 +215,7 @@ class PatternTester { SourceMgr SM; FileCheckRequest Req; FileCheckPatternContext Context; - FileCheckPattern P = - FileCheckPattern(Check::CheckPlain, &Context, LineNumber++); + FileCheckPattern P{Check::CheckPlain, &Context, LineNumber++}; public: PatternTester() { @@ -409,25 +406,24 @@ TEST_F(FileCheckTest, Substitution) { // Substitution of an undefined string variable fails and error holds that // variable's name. - FileCheckStringSubstitution StringSubstitution = - FileCheckStringSubstitution(&Context, "VAR404", 42); + FileCheckStringSubstitution StringSubstitution(&Context, "VAR404", 42); Expected SubstValue = StringSubstitution.getResult(); ASSERT_FALSE(bool(SubstValue)); expectUndefError("VAR404", SubstValue.takeError()); // Substitutions of defined pseudo and non-pseudo numeric variables return // the right value. - FileCheckNumericVariable LineVar = FileCheckNumericVariable("@LINE", 1); - FileCheckNumericVariable NVar = FileCheckNumericVariable("N", 1); + FileCheckNumericVariable LineVar("@LINE", 1); + FileCheckNumericVariable NVar("N", 1); LineVar.setValue(42); NVar.setValue(10); auto LineVarUse = std::make_unique("@LINE", &LineVar); auto NVarUse = std::make_unique("N", &NVar); - FileCheckNumericSubstitution SubstitutionLine = FileCheckNumericSubstitution( - &Context, "@LINE", std::move(LineVarUse), 12); - FileCheckNumericSubstitution SubstitutionN = - FileCheckNumericSubstitution(&Context, "N", std::move(NVarUse), 30); + FileCheckNumericSubstitution SubstitutionLine(&Context, "@LINE", + std::move(LineVarUse), 12); + FileCheckNumericSubstitution SubstitutionN(&Context, "N", std::move(NVarUse), + 30); SubstValue = SubstitutionLine.getResult(); ASSERT_TRUE(bool(SubstValue)); EXPECT_EQ("42", *SubstValue); @@ -447,7 +443,7 @@ TEST_F(FileCheckTest, Substitution) { expectUndefError("N", SubstValue.takeError()); // Substitution of a defined string variable returns the right value. 
- FileCheckPattern P = FileCheckPattern(Check::CheckPlain, &Context, 1); + FileCheckPattern P(Check::CheckPlain, &Context, 1); StringSubstitution = FileCheckStringSubstitution(&Context, "FOO", 42); SubstValue = StringSubstitution.getResult(); ASSERT_TRUE(bool(SubstValue)); @@ -455,7 +451,7 @@ TEST_F(FileCheckTest, Substitution) { } TEST_F(FileCheckTest, FileCheckContext) { - FileCheckPatternContext Cxt = FileCheckPatternContext(); + FileCheckPatternContext Cxt; std::vector GlobalDefines; SourceMgr SM; @@ -518,7 +514,7 @@ TEST_F(FileCheckTest, FileCheckContext) { StringRef EmptyVarStr = "EmptyVar"; StringRef UnknownVarStr = "UnknownVar"; Expected LocalVar = Cxt.getPatternVarValue(LocalVarStr); - FileCheckPattern P = FileCheckPattern(Check::CheckPlain, &Cxt, 1); + FileCheckPattern P(Check::CheckPlain, &Cxt, 1); Optional DefinedNumericVariable; Expected> ExpressionAST = P.parseNumericSubstitutionBlock(LocalNumVar1Ref, DefinedNumericVariable, diff --git a/llvm/unittests/TableGen/Automata.td b/llvm/unittests/TableGen/Automata.td new file mode 100644 index 00000000000000..80671e5e61cbee --- /dev/null +++ b/llvm/unittests/TableGen/Automata.td @@ -0,0 +1,186 @@ +include "llvm/TableGen/Automaton.td" +include "llvm/TableGen/SearchableTable.td" + +// Define a set of input token symbols. +class SymKindTy; +def SK_a : SymKindTy; +def SK_b : SymKindTy; +def SK_c : SymKindTy; +def SK_d : SymKindTy; + +// Emit those as a C++ enum using SearchableTables. +def SymKind : GenericEnum { + let FilterClass = "SymKindTy"; +} + +// Define a transition implementation. +class SimpleTransition State, SymKindTy A> : Transition { + let NewState{1-0} = State; + SymKindTy ActionSym = A; +} + +// Token SK_a sets bit 0b01. +def : SimpleTransition<0b01, SK_a>; +// Token SK_b sets bits 0b10. +def : SimpleTransition<0b10, SK_b>; +// Token SK_c sets both bits 0b11. +def : SimpleTransition<0b11, SK_c>; + +def SimpleAutomaton : GenericAutomaton { + let TransitionClass = "SimpleTransition"; + let SymbolFields = ["ActionSym"]; + // Override the type of ActionSym from SymKindTy to the C++ type SymKind. + string TypeOf_ActionSym = "SymKind"; +} + +//===----------------------------------------------------------------------===// +// TupleActionAutomaton test implementation + +// Define a transition implementation. +class TupleTransition State, SymKindTy s1, SymKindTy s2, string s3> : Transition { + let NewState{1-0} = State; + SymKindTy S1 = s1; + SymKindTy S2 = s2; + string S3 = s3; +} + +def : TupleTransition<0b01, SK_a, SK_b, "yeet">; +def : TupleTransition<0b10, SK_b, SK_b, "foo">; +def : TupleTransition<0b10, SK_c, SK_a, "foo">; + +def TupleAutomaton : GenericAutomaton { + let TransitionClass = "TupleTransition"; + let SymbolFields = ["S1", "S2", "S3"]; + string TypeOf_S1 = "SymKind"; + string TypeOf_S2 = "SymKind"; +} + +//===----------------------------------------------------------------------===// +// NfaAutomaton test implementation + +class NfaTransition State, SymKindTy S> : Transition { + let NewState{1-0} = State; + SymKindTy A = S; +} + +// Symbols a and b can transition to 0b01 or 0b11 (sets bit 0). +def : NfaTransition<0b01, SK_a>; +def : NfaTransition<0b01, SK_b>; +// Symbols a and b can also transition to 0b10 or 0b11 (sets bit 1). 
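// (Each of SK_a and SK_b therefore has two candidate successor states, which
// is what makes this automaton nondeterministic.)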
+def : NfaTransition<0b10, SK_a>; +def : NfaTransition<0b10, SK_b>; + +def NfaAutomaton : GenericAutomaton { + let TransitionClass = "NfaTransition"; + let SymbolFields = ["A"]; + string TypeOf_A = "SymKind"; +} + +//===----------------------------------------------------------------------===// +// BinPacker test implementation +//===----------------------------------------------------------------------===// +// This test generates an automaton that can pack values into bins subject to +// constraints. There are 6 possible bins, and the input tokens are constraint +// types. Some input types span two bins. + +// The symbol type for a bin constraint. We use lists of ints as a tblgen hack +// to conditionally generate defs within multiclasses based on record +// information. A bin is nonempty (has a dummy one-element value) if enabled. +class BinRequirementKind { + list Bin0 = []; + list Bin1 = []; + list Bin2 = []; + list Bin3 = []; + list Bin4 = []; + list Bin5 = []; +} +// Can use bins {0-3} +def BRK_0_to_4 : BinRequirementKind { let Bin0 = [1]; let Bin1 = [1]; let Bin2 = [1]; let Bin3 = [1]; } +// Can use bins {0-3} but only evens (0 and 2). +def BRK_0_to_4_lo : BinRequirementKind { let Bin0 = [1]; let Bin2 = [1]; } +// Can use bins {0-3} but only odds (1 and 3). +def BRK_0_to_4_hi : BinRequirementKind { let Bin1 = [1]; let Bin3 = [1]; } +// Can use bins {0-3} but only even-odd pairs (0+1 or 1+2). +def BRK_0_to_4_dbl : BinRequirementKind { let Bin0 = [1]; let Bin2 = [1]; } +def BRK_0_to_6 : BinRequirementKind { let Bin0 = [1]; let Bin1 = [1]; let Bin2 = [1]; + let Bin3 = [1]; let Bin4 = [1]; let Bin5 = [1]; } +def BRK_0_to_6_lo : BinRequirementKind { let Bin0 = [1]; let Bin2 = [1]; let Bin4 = [1]; } +def BRK_0_to_6_hi : BinRequirementKind { let Bin1 = [1]; let Bin3 = [1]; let Bin5 = [1]; } +def BRK_0_to_6_dbl : BinRequirementKind { let Bin0 = [1]; let Bin2 = [1]; let Bin4 = [1]; } +def BRK_2_to_6 : BinRequirementKind { let Bin2 = [1]; + let Bin3 = [1]; let Bin4 = [1]; let Bin5 = [1]; } +def BRK_2_to_6_lo : BinRequirementKind { let Bin2 = [1]; let Bin4 = [1]; } +def BRK_2_to_6_hi : BinRequirementKind { let Bin3 = [1]; let Bin5 = [1];} +def BRK_2_to_6_dbl : BinRequirementKind { let Bin2 = [1]; let Bin4 = [1]; } +def BRK_2_to_4 : BinRequirementKind { let Bin2 = [1]; let Bin3 = [1]; } +def BRK_2_to_4_lo : BinRequirementKind { let Bin2 = [1]; } +def BRK_2_to_4_hi : BinRequirementKind { let Bin3 = [1]; } +def BRK_2_to_4_dbl : BinRequirementKind { let Bin2 = [1]; } + +def BinRequirementKindEnum : GenericEnum { + let FilterClass = "BinRequirementKind"; +} + +// The transition class is trivial; it just contains the constraint symbol. +class BinTransition : Transition { + BinRequirementKind Sym; +} + +// Mixin that occupies a single bin. +class Bin0 : BinTransition { let NewState{0} = 1; } +class Bin1 : BinTransition { let NewState{1} = 1; } +class Bin2 : BinTransition { let NewState{2} = 1;} +class Bin3 : BinTransition { let NewState{3} = 1; } +class Bin4 : BinTransition { let NewState{4} = 1;} +class Bin5 : BinTransition { let NewState{5} = 1; } +// Mixin that occupies a pair of bins (even-odd pairs). +class Bin01 : BinTransition { let NewState{0,1} = 0b11; } +class Bin23 : BinTransition { let NewState{2,3} = 0b11; } +class Bin45 : BinTransition { let NewState{4,5} = 0b11; } + +// Instantiate all possible bin assignments for E. +multiclass BinAssignments { + let Sym = E in { + // Note the tablegen hack to conditionally instantiate a def based on E. 
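// (When a BinN list is empty the foreach body never runs and no def is
// emitted; when it holds the dummy one-element value, exactly one def is
// emitted, so each enabled bin contributes one candidate transition.)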
+ foreach x = E.Bin0 in { def : Bin0; } + foreach x = E.Bin1 in { def : Bin1; } + foreach x = E.Bin2 in { def : Bin2; } + foreach x = E.Bin3 in { def : Bin3; } + foreach x = E.Bin4 in { def : Bin4; } + foreach x = E.Bin5 in { def : Bin5; } + } +} + +// Instantiate all possible bin assignments for E, which spans even-odd pairs. +multiclass DblBinAssignments { + let Sym = E in { + foreach x = E.Bin0 in { def : Bin01; } + foreach x = E.Bin2 in { def : Bin23; } + foreach x = E.Bin4 in { def : Bin45; } + } +} + +defm : BinAssignments; +defm : DblBinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : DblBinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : DblBinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : BinAssignments; +defm : DblBinAssignments; +defm : BinAssignments; +defm : BinAssignments; + +def BinPackerAutomaton : GenericAutomaton { + let TransitionClass = "BinTransition"; + let SymbolFields = ["Sym"]; + string TypeOf_Sym = "BinRequirementKindEnum"; +} + + diff --git a/llvm/unittests/TableGen/AutomataTest.cpp b/llvm/unittests/TableGen/AutomataTest.cpp new file mode 100644 index 00000000000000..fb19716c484992 --- /dev/null +++ b/llvm/unittests/TableGen/AutomataTest.cpp @@ -0,0 +1,153 @@ +//===- unittest/TableGen/AutomataTest.cpp - DFA tests ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Automaton.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace llvm; +using testing::ContainerEq; +using testing::UnorderedElementsAre; + +// Bring in the enums created by SearchableTables.td. +#define GET_SymKind_DECL +#define GET_BinRequirementKindEnum_DECL +#include "AutomataTables.inc" + +// And bring in the automata from Automata.td. +#define GET_SimpleAutomaton_DECL +#define GET_TupleAutomaton_DECL +#define GET_NfaAutomaton_DECL +#define GET_BinPackerAutomaton_DECL +#include "AutomataAutomata.inc" + +TEST(Automata, SimpleAutomatonAcceptsFromInitialState) { + Automaton A(makeArrayRef(SimpleAutomatonTransitions)); + EXPECT_TRUE(A.add(SK_a)); + A.reset(); + EXPECT_TRUE(A.add(SK_b)); + A.reset(); + EXPECT_TRUE(A.add(SK_c)); + A.reset(); + EXPECT_FALSE(A.add(SK_d)); +} + +TEST(Automata, SimpleAutomatonAcceptsSequences) { + Automaton A(makeArrayRef(SimpleAutomatonTransitions)); + // Test sequence + A.reset(); + EXPECT_TRUE(A.add(SK_a)); + EXPECT_TRUE(A.add(SK_b)); + + // Test sequence is rejected (c cannot get bit 0b10); + A.reset(); + EXPECT_TRUE(A.add(SK_a)); + EXPECT_FALSE(A.add(SK_c)); + + // Symmetric test: sequence is rejected. 
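// (SK_c occupies both bits up front, leaving no free bit for SK_a, so the
// second add is rejected.)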
+ A.reset(); + EXPECT_TRUE(A.add(SK_c)); + EXPECT_FALSE(A.add(SK_a)); +} + +TEST(Automata, TupleAutomatonAccepts) { + Automaton A(makeArrayRef(TupleAutomatonTransitions)); + A.reset(); + EXPECT_TRUE( + A.add(TupleAutomatonAction{SK_a, SK_b, "yeet"})); + A.reset(); + EXPECT_FALSE( + A.add(TupleAutomatonAction{SK_a, SK_a, "yeet"})); + A.reset(); + EXPECT_FALSE( + A.add(TupleAutomatonAction{SK_a, SK_b, "feet"})); + A.reset(); + EXPECT_TRUE( + A.add(TupleAutomatonAction{SK_b, SK_b, "foo"})); +} + +TEST(Automata, NfaAutomatonAccepts) { + Automaton A(makeArrayRef(NfaAutomatonTransitions)); + + // Test sequences , , , . All should be accepted. + A.reset(); + EXPECT_TRUE(A.add(SK_a)); + EXPECT_TRUE(A.add(SK_a)); + A.reset(); + EXPECT_TRUE(A.add(SK_a)); + EXPECT_TRUE(A.add(SK_b)); + A.reset(); + EXPECT_TRUE(A.add(SK_b)); + EXPECT_TRUE(A.add(SK_a)); + A.reset(); + EXPECT_TRUE(A.add(SK_b)); + EXPECT_TRUE(A.add(SK_b)); + + // Expect that is not accepted. + A.reset(); + EXPECT_TRUE(A.add(SK_b)); + EXPECT_TRUE(A.add(SK_b)); + EXPECT_FALSE(A.add(SK_b)); +} + +TEST(Automata, BinPackerAutomatonAccepts) { + Automaton A(makeArrayRef(BinPackerAutomatonTransitions)); + + // Expect that we can pack two double-bins in 0-4, then no more in 0-4. + A.reset(); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_FALSE(A.add(BRK_0_to_4)); + + // Expect that we can pack two double-bins in 0-4, two more in 0-6 then no + // more. + A.reset(); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_6)); + EXPECT_TRUE(A.add(BRK_0_to_6)); + EXPECT_FALSE(A.add(BRK_0_to_6)); + + // Expect that we can pack BRK_0_to_6 five times to occupy five bins, then + // cannot allocate any double-bins. + A.reset(); + for (unsigned I = 0; I < 5; ++I) + EXPECT_TRUE(A.add(BRK_0_to_6)); + EXPECT_FALSE(A.add(BRK_0_to_6_dbl)); +} + +// The state we defined in TableGen uses the least significant 6 bits to represent a bin state. +#define BINS(a, b, c, d, e, f) \ + ((a << 5) | (b << 4) | (c << 3) | (d << 2) | (e << 1) | (f << 0)) + +TEST(Automata, BinPackerAutomatonExplains) { + Automaton A(makeArrayRef(BinPackerAutomatonTransitions), + makeArrayRef(BinPackerAutomatonTransitionInfo)); + // Pack two double-bins in 0-4, then a single bin in 0-6. + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_4_dbl)); + EXPECT_TRUE(A.add(BRK_0_to_6)); + EXPECT_THAT( + A.getNfaPaths(), + UnorderedElementsAre( + // Allocate {0,1} first, then 6. + ContainerEq(NfaPath{BINS(0, 0, 0, 0, 1, 1), BINS(0, 0, 1, 1, 1, 1), + BINS(1, 0, 1, 1, 1, 1)}), + // Allocate {0,1} first, then 5. + ContainerEq(NfaPath{BINS(0, 0, 0, 0, 1, 1), BINS(0, 0, 1, 1, 1, 1), + BINS(0, 1, 1, 1, 1, 1)}), + // Allocate {2,3} first, then 6. + ContainerEq(NfaPath{BINS(0, 0, 1, 1, 0, 0), BINS(0, 0, 1, 1, 1, 1), + BINS(1, 0, 1, 1, 1, 1)}), + // Allocate {2,3} first, then 5. 
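// For example, with the BINS macro below, BINS(0, 0, 1, 1, 0, 0) evaluates to
// (1 << 3) | (1 << 2) == 0b001100, the state in which bins 2 and 3 are
// occupied (arguments are listed from bin 5 down to bin 0).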
+ ContainerEq(NfaPath{BINS(0, 0, 1, 1, 0, 0), BINS(0, 0, 1, 1, 1, 1), + BINS(0, 1, 1, 1, 1, 1)}))); +} diff --git a/llvm/unittests/TableGen/CMakeLists.txt b/llvm/unittests/TableGen/CMakeLists.txt new file mode 100644 index 00000000000000..328ba56691d7ed --- /dev/null +++ b/llvm/unittests/TableGen/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_LINK_COMPONENTS + TableGen + Support + ) + +set(LLVM_TARGET_DEFINITIONS Automata.td) + +tablegen(LLVM AutomataTables.inc -gen-searchable-tables) +tablegen(LLVM AutomataAutomata.inc -gen-automata) +add_public_tablegen_target(AutomataTestTableGen) + +add_llvm_unittest(TableGenTests + CodeExpanderTest.cpp + AutomataTest.cpp + ) +include_directories(${CMAKE_SOURCE_DIR}/utils/TableGen) +target_link_libraries(TableGenTests PRIVATE LLVMTableGenGlobalISel LLVMTableGen) diff --git a/llvm/unittests/TableGen/CodeExpanderTest.cpp b/llvm/unittests/TableGen/CodeExpanderTest.cpp new file mode 100644 index 00000000000000..75b9b737370738 --- /dev/null +++ b/llvm/unittests/TableGen/CodeExpanderTest.cpp @@ -0,0 +1,203 @@ +//===- llvm/unittest/TableGen/CodeExpanderTest.cpp - Tests ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "GlobalISel/CodeExpander.h" +#include "GlobalISel/CodeExpansions.h" + +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Error.h" +#include "gtest/gtest.h" + +using namespace llvm; + +static StringRef bufferize(StringRef Str) { + std::unique_ptr Buffer = + MemoryBuffer::getMemBufferCopy(Str, "TestBuffer"); + StringRef StrBufferRef = Buffer->getBuffer(); + SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc()); + return StrBufferRef; +} + +class RAIIDiagnosticChecker { + std::string EmittedDiags; + raw_string_ostream OS; + std::vector Expected; + std::vector Received; + +public: + RAIIDiagnosticChecker() : OS(EmittedDiags) { + SrcMgr.setDiagHandler(handler, this); + } + ~RAIIDiagnosticChecker() { + SrcMgr.setDiagHandler(nullptr); + EXPECT_EQ(Received.size(), Expected.size()); + for (unsigned i = 0; i < Received.size() && i < Expected.size(); ++i) { + EXPECT_EQ(Received[i].getLoc(), Expected[i].getLoc()); + EXPECT_EQ(Received[i].getFilename(), Expected[i].getFilename()); + EXPECT_EQ(Received[i].getKind(), Expected[i].getKind()); + EXPECT_EQ(Received[i].getLineNo(), Expected[i].getLineNo()); + EXPECT_EQ(Received[i].getColumnNo(), Expected[i].getColumnNo()); + EXPECT_EQ(Received[i].getMessage(), Expected[i].getMessage()); + EXPECT_EQ(Received[i].getLineContents(), Expected[i].getLineContents()); + EXPECT_EQ(Received[i].getRanges(), Expected[i].getRanges()); + } + + if (testing::Test::HasFailure()) + errs() << "Emitted diagnostic:\n" << OS.str(); + } + + void expect(SMDiagnostic D) { Expected.push_back(D); } + + void diag(const SMDiagnostic &D) { + Received.push_back(D); + } + + static void handler(const SMDiagnostic &D, void *Context) { + RAIIDiagnosticChecker *Self = static_cast(Context); + Self->diag(D); + SrcMgr.setDiagHandler(nullptr); + SrcMgr.PrintMessage(Self->OS, D); + SrcMgr.setDiagHandler(handler, Context); + }; +}; + +TEST(CodeExpander, NoExpansions) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("No expansions", Expansions, SMLoc(), false).emit(OS); + 
EXPECT_EQ(OS.str(), "No expansions"); +} + +// Indentation is applied to all lines except the first +TEST(CodeExpander, Indentation) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("No expansions\nsecond line\nthird line", Expansions, SMLoc(), + false, " ") + .emit(OS); + EXPECT_EQ(OS.str(), "No expansions\n second line\n third line"); +} + +// \ is an escape character that removes special meanings from the next +// character. +TEST(CodeExpander, Escape) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("\\\\\\a\\$", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "\\a$"); +} + +// $foo is not an expansion. It should warn though. +TEST(CodeExpander, NotAnExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + StringRef In = bufferize(" $foo"); + CodeExpander(" $foo", Expansions, SMLoc::getFromPointer(In.data()), false) + .emit(OS); + EXPECT_EQ(OS.str(), " $foo"); + DiagChecker.expect(SMDiagnostic( + SrcMgr, SMLoc::getFromPointer(In.data() + 1), "TestBuffer", 1, 1, + SourceMgr::DK_Warning, "Assuming missing escape character", " $foo", {})); +} + +// \$foo is not an expansion but shouldn't warn as it's using the escape. +TEST(CodeExpander, EscapedNotAnExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("\\$foo", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "$foo"); +} + +// \${foo is not an expansion but shouldn't warn as it's using the escape. +TEST(CodeExpander, EscapedUnterminatedExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("\\${foo", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "${foo"); +} + +// \${foo is not an expansion but shouldn't warn as it's using the escape. +TEST(CodeExpander, EscapedExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("\\${foo}", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "${foo}"); +} + +// ${foo} is an undefined expansion and should error. +TEST(CodeExpander, UndefinedExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + Expansions.declare("bar", "expansion"); + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("${foo}${bar}", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "expansion"); + DiagChecker.expect( + SMDiagnostic(SrcMgr, SMLoc(), "", 0, -1, SourceMgr::DK_Error, + "Attempting to expand an undeclared variable foo", "", {})); +} + +// ${foo} is an undefined expansion and should error. When given a valid +// location for the start of the buffer it should correctly point at the +// expansion being performed. 
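+// For example, for the buffer "Padding ${foo}${bar}" below, the error about
+// the undeclared variable foo should be attached to line 1, column 8, the
+// offset at which "${foo}" starts, instead of to an invalid SMLoc.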
+TEST(CodeExpander, UndefinedExpansionWithLoc) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + Expansions.declare("bar", "expansion"); + + RAIIDiagnosticChecker DiagChecker; + StringRef In = bufferize("Padding ${foo}${bar}"); + CodeExpander(In, Expansions, SMLoc::getFromPointer(In.data()), false) + .emit(OS); + EXPECT_EQ(OS.str(), "Padding expansion"); + DiagChecker.expect(SMDiagnostic( + SrcMgr, SMLoc::getFromPointer(In.data() + 8), "TestBuffer", 1, 8, + SourceMgr::DK_Error, "Attempting to expand an undeclared variable foo", + "Padding ${foo}${bar}", {})); +} + +// ${bar is an unterminated expansion. Warn and implicitly terminate it. +TEST(CodeExpander, UnterminatedExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + Expansions.declare("bar", "expansion"); + + RAIIDiagnosticChecker DiagChecker; + StringRef In = bufferize(" ${bar"); + CodeExpander(In, Expansions, SMLoc::getFromPointer(In.data()), false) + .emit(OS); + EXPECT_EQ(OS.str(), " expansion"); + DiagChecker.expect(SMDiagnostic(SrcMgr, SMLoc::getFromPointer(In.data() + 1), + "TestBuffer", 1, 1, SourceMgr::DK_Warning, + "Unterminated expansion", " ${bar", {})); +} diff --git a/llvm/unittests/Target/AArch64/TestStackOffset.cpp b/llvm/unittests/Target/AArch64/TestStackOffset.cpp index 240cec9f2d0b31..c85135ef660587 100644 --- a/llvm/unittests/Target/AArch64/TestStackOffset.cpp +++ b/llvm/unittests/Target/AArch64/TestStackOffset.cpp @@ -20,6 +20,15 @@ TEST(StackOffset, MixedSize) { StackOffset C(2, MVT::v4i64); EXPECT_EQ(64, C.getBytes()); + + StackOffset D(2, MVT::nxv4i64); + EXPECT_EQ(64, D.getScalableBytes()); + + StackOffset E(2, MVT::v4i64); + EXPECT_EQ(0, E.getScalableBytes()); + + StackOffset F(2, MVT::nxv4i64); + EXPECT_EQ(0, F.getBytes()); } TEST(StackOffset, Add) { @@ -31,6 +40,11 @@ TEST(StackOffset, Add) { StackOffset D(1, MVT::i32); D += A; EXPECT_EQ(12, D.getBytes()); + + StackOffset E(1, MVT::nxv1i32); + StackOffset F = C + E; + EXPECT_EQ(12, F.getBytes()); + EXPECT_EQ(4, F.getScalableBytes()); } TEST(StackOffset, Sub) { @@ -42,6 +56,12 @@ TEST(StackOffset, Sub) { StackOffset D(1, MVT::i64); D -= A; EXPECT_EQ(0, D.getBytes()); + + C += StackOffset(2, MVT::nxv1i32); + StackOffset E = StackOffset(1, MVT::nxv1i32); + StackOffset F = C - E; + EXPECT_EQ(4, F.getBytes()); + EXPECT_EQ(4, F.getScalableBytes()); } TEST(StackOffset, isZero) { @@ -49,12 +69,63 @@ TEST(StackOffset, isZero) { StackOffset B(0, MVT::i32); EXPECT_TRUE(!A); EXPECT_TRUE(!(A + B)); + + StackOffset C(0, MVT::nxv1i32); + EXPECT_TRUE(!(A + C)); + + StackOffset D(1, MVT::nxv1i32); + EXPECT_FALSE(!(A + D)); +} + +TEST(StackOffset, isValid) { + EXPECT_FALSE(StackOffset(1, MVT::nxv8i1).isValid()); + EXPECT_TRUE(StackOffset(2, MVT::nxv8i1).isValid()); + +#ifndef NDEBUG +#ifdef GTEST_HAS_DEATH_TEST + EXPECT_DEATH(StackOffset(1, MVT::i1), + "Offset type is not a multiple of bytes"); + EXPECT_DEATH(StackOffset(1, MVT::nxv1i1), + "Offset type is not a multiple of bytes"); +#endif // defined GTEST_HAS_DEATH_TEST +#endif // not defined NDEBUG } TEST(StackOffset, getForFrameOffset) { StackOffset A(1, MVT::i64); StackOffset B(1, MVT::i32); - int64_t ByteSized; - (A + B).getForFrameOffset(ByteSized); + StackOffset C(1, MVT::nxv4i32); + + // If all offsets can be materialized with only ADDVL, + // make sure PLSized is 0. 
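+  // Worked example for the case below: A(1 x i64) and B(1 x i32) contribute
+  // 8 + 4 = 12 fixed bytes, and C(1 x nxv4i32) contributes 16 scalable bytes,
+  // i.e. exactly one ADDVL unit, so ByteSized should be 12, VLSized 1 and
+  // PLSized 0.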
+ int64_t ByteSized, VLSized, PLSized; + (A + B + C).getForFrameOffset(ByteSized, PLSized, VLSized); EXPECT_EQ(12, ByteSized); + EXPECT_EQ(1, VLSized); + EXPECT_EQ(0, PLSized); + + // If we need an ADDPL to materialize the offset, and the number of scalable + // bytes fits the ADDPL immediate, fold the scalable bytes to fit in PLSized. + StackOffset D(1, MVT::nxv16i1); + (C + D).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(0, VLSized); + EXPECT_EQ(9, PLSized); + + StackOffset E(4, MVT::nxv4i32); + StackOffset F(1, MVT::nxv16i1); + (E + F).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(0, VLSized); + EXPECT_EQ(33, PLSized); + + // If the offset requires an ADDPL instruction to materialize, and would + // require more than two instructions, decompose it into both + // ADDVL (n x 16 bytes) and ADDPL (n x 2 bytes) instructions. + StackOffset G(8, MVT::nxv4i32); + StackOffset H(1, MVT::nxv16i1); + (G + H).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(8, VLSized); + EXPECT_EQ(1, PLSized); } diff --git a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp index 8b86951fa5e199..9213be726970c6 100644 --- a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp +++ b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/AsmParser/Parser.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -225,4 +226,55 @@ TEST(CodeExtractor, StoreOutputInvokeResultInExitStub) { EXPECT_FALSE(verifyFunction(*Func)); } +TEST(CodeExtractor, ExtractAndInvalidateAssumptionCache) { + LLVMContext Ctx; + SMDiagnostic Err; + std::unique_ptr M(parseAssemblyString(R"ir( + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + %b = type { i64 } + declare void @g(i8*) + + declare void @llvm.assume(i1) #0 + + define void @test() { + entry: + br label %label + + label: + %0 = load %b*, %b** inttoptr (i64 8 to %b**), align 8 + %1 = getelementptr inbounds %b, %b* %0, i64 undef, i32 0 + %2 = load i64, i64* %1, align 8 + %3 = icmp ugt i64 %2, 1 + br i1 %3, label %if.then, label %if.else + + if.then: + unreachable + + if.else: + call void @g(i8* undef) + store i64 undef, i64* null, align 536870912 + %4 = icmp eq i64 %2, 0 + call void @llvm.assume(i1 %4) + unreachable + } + + attributes #0 = { nounwind willreturn } + )ir", + Err, Ctx)); + + assert(M && "Could not parse module?"); + Function *Func = M->getFunction("test"); + SmallVector Blocks{ getBlockByName(Func, "if.else") }; + AssumptionCache AC(*Func); + CodeExtractor CE(Blocks, nullptr, false, nullptr, nullptr, &AC); + EXPECT_TRUE(CE.isEligible()); + + Function *Outlined = CE.extractCodeRegion(); + EXPECT_TRUE(Outlined); + EXPECT_FALSE(verifyFunction(*Outlined)); + EXPECT_FALSE(verifyFunction(*Func)); + EXPECT_FALSE(CE.verifyAssumptionCache(*Func, &AC)); +} } // end anonymous namespace diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 55f04e98d77dc1..1f67a1ec84c766 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -867,12 +867,42 @@ TEST(Local, RemoveUnreachableBlocks) { bb2: br label %bb1 } + + declare i32 @__gxx_personality_v0(...) 
+ + define void @invoke_terminator() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + entry: + br i1 undef, label %invoke.block, label %exit + + invoke.block: + %cond = invoke zeroext i1 @invokable() + to label %continue.block unwind label %lpad.block + + continue.block: + br i1 %cond, label %if.then, label %if.end + + if.then: + unreachable + + if.end: + unreachable + + lpad.block: + %lp = landingpad { i8*, i32 } + catch i8* null + br label %exit + + exit: + ret void + } + + declare i1 @invokable() )"); auto runEager = [&](Function &F, DominatorTree *DT) { PostDominatorTree PDT = PostDominatorTree(F); DomTreeUpdater DTU(*DT, PDT, DomTreeUpdater::UpdateStrategy::Eager); - removeUnreachableBlocks(F, nullptr, &DTU); + removeUnreachableBlocks(F, &DTU); EXPECT_TRUE(DTU.getDomTree().verify()); EXPECT_TRUE(DTU.getPostDomTree().verify()); }; @@ -880,7 +910,7 @@ TEST(Local, RemoveUnreachableBlocks) { auto runLazy = [&](Function &F, DominatorTree *DT) { PostDominatorTree PDT = PostDominatorTree(F); DomTreeUpdater DTU(*DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy); - removeUnreachableBlocks(F, nullptr, &DTU); + removeUnreachableBlocks(F, &DTU); EXPECT_TRUE(DTU.getDomTree().verify()); EXPECT_TRUE(DTU.getPostDomTree().verify()); }; @@ -890,12 +920,14 @@ TEST(Local, RemoveUnreachableBlocks) { runWithDomTree(*M, "br_self_loop", runEager); runWithDomTree(*M, "br_constant", runEager); runWithDomTree(*M, "br_loop", runEager); + runWithDomTree(*M, "invoke_terminator", runEager); // Test removeUnreachableBlocks under Lazy UpdateStrategy. runWithDomTree(*M, "br_simple", runLazy); runWithDomTree(*M, "br_self_loop", runLazy); runWithDomTree(*M, "br_constant", runLazy); runWithDomTree(*M, "br_loop", runLazy); + runWithDomTree(*M, "invoke_terminator", runLazy); M = parseIR(C, R"( @@ -909,8 +941,8 @@ TEST(Local, RemoveUnreachableBlocks) { auto checkRUBlocksRetVal = [&](Function &F, DominatorTree *DT) { DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - EXPECT_TRUE(removeUnreachableBlocks(F, nullptr, &DTU)); - EXPECT_FALSE(removeUnreachableBlocks(F, nullptr, &DTU)); + EXPECT_TRUE(removeUnreachableBlocks(F, &DTU)); + EXPECT_FALSE(removeUnreachableBlocks(F, &DTU)); EXPECT_TRUE(DTU.getDomTree().verify()); }; diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt index c88365a2b8cecf..407e10d8bf2d60 100644 --- a/llvm/utils/TableGen/CMakeLists.txt +++ b/llvm/utils/TableGen/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(GlobalISel) + set(LLVM_LINK_COMPONENTS Support) add_tablegen(llvm-tblgen LLVM @@ -19,11 +21,13 @@ add_tablegen(llvm-tblgen LLVM DAGISelMatcherGen.cpp DAGISelMatcherOpt.cpp DAGISelMatcher.cpp + DFAEmitter.cpp DFAPacketizerEmitter.cpp DisassemblerEmitter.cpp ExegesisEmitter.cpp FastISelEmitter.cpp FixedLenDecoderEmitter.cpp + GICombinerEmitter.cpp GlobalISelEmitter.cpp InfoByHwMode.cpp InstrInfoEmitter.cpp @@ -49,4 +53,5 @@ add_tablegen(llvm-tblgen LLVM WebAssemblyDisassemblerEmitter.cpp CTagsEmitter.cpp ) +target_link_libraries(llvm-tblgen PRIVATE LLVMTableGenGlobalISel) set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning") diff --git a/llvm/utils/TableGen/CodeGenInstruction.cpp b/llvm/utils/TableGen/CodeGenInstruction.cpp index 2463824469abfa..fde946d065891f 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.cpp +++ b/llvm/utils/TableGen/CodeGenInstruction.cpp @@ -363,6 +363,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) Namespace = R->getValueAsString("Namespace"); AsmString = R->getValueAsString("AsmString"); + 
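+  // isPreISelOpcode is forwarded to InstrInfoEmitter, which re-emits it as a
+  // target-independent instruction flag (presumably marking opcodes that are
+  // only valid before instruction selection).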
isPreISelOpcode = R->getValueAsBit("isPreISelOpcode"); isReturn = R->getValueAsBit("isReturn"); isEHScopeReturn = R->getValueAsBit("isEHScopeReturn"); isBranch = R->getValueAsBit("isBranch"); diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h index bb5b1369649f55..2cb28425df7aa4 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.h +++ b/llvm/utils/TableGen/CodeGenInstruction.h @@ -231,6 +231,7 @@ template class ArrayRef; std::vector ImplicitDefs, ImplicitUses; // Various boolean values we track for the instruction. + bool isPreISelOpcode : 1; bool isReturn : 1; bool isEHScopeReturn : 1; bool isBranch : 1; diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp index cb05f78fba411c..f12d7d484a8eaa 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -1083,9 +1083,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { if (RWD->getValueAsDef("SchedModel") == RWModelDef && RWModelDef->getValueAsBit("FullInstRWOverlapCheck")) { for (Record *Inst : InstDefs) { - PrintFatalError(InstRWDef->getLoc(), "Overlapping InstRW def " + - Inst->getName() + " also matches " + - RWD->getValue("Instrs")->getValue()->getAsString()); + PrintFatalError + (InstRWDef->getLoc(), + "Overlapping InstRW definition for \"" + + Inst->getName() + + "\" also matches previous \"" + + RWD->getValue("Instrs")->getValue()->getAsString() + + "\"."); } } } @@ -1115,9 +1119,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { for (Record *OldRWDef : SchedClasses[OldSCIdx].InstRWs) { if (OldRWDef->getValueAsDef("SchedModel") == RWModelDef) { for (Record *InstDef : InstDefs) { - PrintFatalError(OldRWDef->getLoc(), "Overlapping InstRW def " + - InstDef->getName() + " also matches " + - OldRWDef->getValue("Instrs")->getValue()->getAsString()); + PrintFatalError + (InstRWDef->getLoc(), + "Overlapping InstRW definition for \"" + + InstDef->getName() + + "\" also matches previous \"" + + OldRWDef->getValue("Instrs")->getValue()->getAsString() + + "\"."); } } assert(OldRWDef != InstRWDef && diff --git a/llvm/utils/TableGen/DFAEmitter.cpp b/llvm/utils/TableGen/DFAEmitter.cpp new file mode 100644 index 00000000000000..dd3db7c150ba68 --- /dev/null +++ b/llvm/utils/TableGen/DFAEmitter.cpp @@ -0,0 +1,394 @@ +//===- DFAEmitter.cpp - Finite state automaton emitter --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This class can produce a generic deterministic finite state automaton (DFA), +// given a set of possible states and transitions. +// +// The input transitions can be nondeterministic - this class will produce the +// deterministic equivalent state machine. +// +// The generated code can run the DFA and produce an accepted / not accepted +// state and also produce, given a sequence of transitions that results in an +// accepted state, the sequence of intermediate states. This is useful if the +// initial automaton was nondeterministic - it allows mapping back from the DFA +// to the NFA. 
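+//
+// As a small worked example of the determinization performed here, the NFA
+// transitions {0 --a--> 1, 0 --a--> 2, 1 --b--> 3} yield the DFA states {0},
+// {1,2} and {3}, with {0} --a--> {1,2} and {1,2} --b--> {3}; the emitted
+// transition info additionally records that the a-edge stands for both NFA
+// transitions 0->1 and 0->2, which is what makes the NFA paths recoverable.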
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "dfa-emitter" + +#include "DFAEmitter.h" +#include "CodeGenTarget.h" +#include "SequenceToOffsetTable.h" +#include "TableGenBackends.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Record.h" +#include "llvm/TableGen/TableGenBackend.h" +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// DfaEmitter implementation. This is independent of the GenAutomaton backend. +//===----------------------------------------------------------------------===// + +void DfaEmitter::addTransition(state_type From, state_type To, action_type A) { + Actions.insert(A); + NfaStates.insert(From); + NfaStates.insert(To); + NfaTransitions[{From, A}].push_back(To); + ++NumNfaTransitions; +} + +void DfaEmitter::visitDfaState(DfaState DS) { + // For every possible action... + auto FromId = DfaStates.idFor(DS); + for (action_type A : Actions) { + DfaState NewStates; + DfaTransitionInfo TI; + // For every represented state, word pair in the original NFA... + for (state_type &FromState : DS) { + // If this action is possible from this state add the transitioned-to + // states to NewStates. + auto I = NfaTransitions.find({FromState, A}); + if (I == NfaTransitions.end()) + continue; + for (state_type &ToState : I->second) { + NewStates.push_back(ToState); + TI.emplace_back(FromState, ToState); + } + } + if (NewStates.empty()) + continue; + // Sort and unique. + sort(NewStates); + NewStates.erase(std::unique(NewStates.begin(), NewStates.end()), + NewStates.end()); + sort(TI); + TI.erase(std::unique(TI.begin(), TI.end()), TI.end()); + unsigned ToId = DfaStates.insert(NewStates); + DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI)); + } +} + +void DfaEmitter::constructDfa() { + DfaState Initial(1, /*NFA initial state=*/0); + DfaStates.insert(Initial); + + // Note that UniqueVector starts indices at 1, not zero. + unsigned DfaStateId = 1; + while (DfaStateId <= DfaStates.size()) + visitDfaState(DfaStates[DfaStateId++]); +} + +void DfaEmitter::emit(StringRef Name, raw_ostream &OS) { + constructDfa(); + + OS << "// Input NFA has " << NfaStates.size() << " states with " + << NumNfaTransitions << " transitions.\n"; + OS << "// Generated DFA has " << DfaStates.size() << " states with " + << DfaTransitions.size() << " transitions.\n\n"; + + // Implementation note: We don't bake a simple std::pair<> here as it requires + // significantly more effort to parse. A simple test with a large array of + // struct-pairs (N=100000) took clang-10 6s to parse. The same array of + // std::pair took 242s. Instead we allow the user to + // define the pair type. + // + // FIXME: It may make sense to emit these as ULEB sequences instead of + // pairs of uint64_t. + OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n"; + OS << "// transition implies a set of NFA transitions. 
These are referred\n"; + OS << "// to by index in " << Name << "Transitions[].\n"; + + SequenceToOffsetTable Table; + std::map EmittedIndices; + for (auto &T : DfaTransitions) + Table.add(T.second.second); + Table.layout(); + OS << "std::array " << Name + << "TransitionInfo = {{\n"; + Table.emit( + OS, + [](raw_ostream &OS, std::pair P) { + OS << "{" << P.first << ", " << P.second << "}"; + }, + "{0ULL, 0ULL}"); + + OS << "}};\n\n"; + + OS << "// A transition in the generated " << Name << " DFA.\n"; + OS << "struct " << Name << "Transition {\n"; + OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n"; + OS << " "; + printActionType(OS); + OS << " Action; // The input symbol that causes this transition.\n"; + OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n"; + OS << " unsigned InfoIdx; // Start index into " << Name + << "TransitionInfo.\n"; + OS << "};\n\n"; + + OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n"; + OS << "// The initial state is 1, not zero.\n"; + OS << "std::array<" << Name << "Transition, " << DfaTransitions.size() << "> " + << Name << "Transitions = {{\n"; + for (auto &KV : DfaTransitions) { + dfa_state_type From = KV.first.first; + dfa_state_type To = KV.second.first; + action_type A = KV.first.second; + unsigned InfoIdx = Table.get(KV.second.second); + OS << " {" << From << ", "; + printActionValue(A, OS); + OS << ", " << To << ", " << InfoIdx << "},\n"; + } + OS << "\n}};\n\n"; +} + +void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; } + +void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; } + +//===----------------------------------------------------------------------===// +// AutomatonEmitter implementation +//===----------------------------------------------------------------------===// + +namespace { +// FIXME: This entire discriminated union could be removed with c++17: +// using Action = std::variant; +struct Action { + Record *R = nullptr; + unsigned I = 0; + std::string S = nullptr; + + Action() = default; + Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {} + + void print(raw_ostream &OS) const { + if (R) + OS << R->getName(); + else if (!S.empty()) + OS << '"' << S << '"'; + else + OS << I; + } + bool operator<(const Action &Other) const { + return std::make_tuple(R, I, S) < + std::make_tuple(Other.R, Other.I, Other.S); + } +}; + +using ActionTuple = std::vector; +class Automaton; + +class Transition { + uint64_t NewState; + // The tuple of actions that causes this transition. + ActionTuple Actions; + // The types of the actions; this is the same across all transitions. + SmallVector Types; + +public: + Transition(Record *R, Automaton *Parent); + const ActionTuple &getActions() { return Actions; } + SmallVector getTypes() { return Types; } + + bool canTransitionFrom(uint64_t State); + uint64_t transitionFrom(uint64_t State); +}; + +class Automaton { + RecordKeeper &Records; + Record *R; + std::vector Transitions; + /// All possible action tuples, uniqued. + UniqueVector Actions; + /// The fields within each Transition object to find the action symbols. + std::vector ActionSymbolFields; + +public: + Automaton(RecordKeeper &Records, Record *R); + void emit(raw_ostream &OS); + + ArrayRef getActionSymbolFields() { return ActionSymbolFields; } + /// If the type of action A has been overridden (there exists a field + /// "TypeOf_A") return that, otherwise return the empty string. 
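+  /// For example, the BinPackerAutomaton definition in the unit test's
+  /// Automata.td sets TypeOf_Sym = "BinRequirementKindEnum", so its Sym
+  /// actions are emitted with that enum type rather than the type inferred
+  /// from the record field itself.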
+ StringRef getActionSymbolType(StringRef A); +}; + +class AutomatonEmitter { + RecordKeeper &Records; + +public: + AutomatonEmitter(RecordKeeper &R) : Records(R) {} + void run(raw_ostream &OS); +}; + +/// A DfaEmitter implementation that can print our variant action type. +class CustomDfaEmitter : public DfaEmitter { + const UniqueVector &Actions; + std::string TypeName; + +public: + CustomDfaEmitter(const UniqueVector &Actions, StringRef TypeName) + : Actions(Actions), TypeName(TypeName) {} + + void printActionType(raw_ostream &OS) override; + void printActionValue(action_type A, raw_ostream &OS) override; +}; +} // namespace + +void AutomatonEmitter::run(raw_ostream &OS) { + for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) { + Automaton A(Records, R); + OS << "#ifdef GET_" << R->getName() << "_DECL\n"; + A.emit(OS); + OS << "#endif // GET_" << R->getName() << "_DECL\n"; + } +} + +Automaton::Automaton(RecordKeeper &Records, Record *R) + : Records(Records), R(R) { + LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n"); + ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields"); +} + +void Automaton::emit(raw_ostream &OS) { + StringRef TransitionClass = R->getValueAsString("TransitionClass"); + for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) { + assert(T->isSubClassOf("Transition")); + Transitions.emplace_back(T, this); + Actions.insert(Transitions.back().getActions()); + } + + LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size() + << "\n"); + LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size() + << " potential transitions.\n"); + + StringRef Name = R->getName(); + + CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action"); + // Starting from the initial state, build up a list of possible states and + // transitions. 
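+  // This is a breadth-first walk over the reachable states: a transition may
+  // be taken from a state only while its NewState bits are still clear there
+  // (canTransitionFrom), and taking it sets those bits (transitionFrom), so
+  // every resource encoded in the state mask is claimed at most once along a
+  // path.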
+ std::deque Worklist(1, 0); + std::set SeenStates; + unsigned NumTransitions = 0; + SeenStates.insert(Worklist.front()); + while (!Worklist.empty()) { + uint64_t State = Worklist.front(); + Worklist.pop_front(); + for (Transition &T : Transitions) { + if (!T.canTransitionFrom(State)) + continue; + uint64_t NewState = T.transitionFrom(State); + if (SeenStates.emplace(NewState).second) + Worklist.emplace_back(NewState); + ++NumTransitions; + Emitter.addTransition(State, NewState, Actions.idFor(T.getActions())); + } + } + LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size() + << " states with " << NumTransitions << " transitions.\n"); + + const auto &ActionTypes = Transitions.back().getTypes(); + OS << "// The type of an action in the " << Name << " automaton.\n"; + if (ActionTypes.size() == 1) { + OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n"; + } else { + OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ") + << ">;\n"; + } + OS << "\n"; + + Emitter.emit(Name, OS); +} + +StringRef Automaton::getActionSymbolType(StringRef A) { + Twine Ty = "TypeOf_" + A; + if (!R->getValue(Ty.str())) + return ""; + return R->getValueAsString(Ty.str()); +} + +Transition::Transition(Record *R, Automaton *Parent) { + BitsInit *NewStateInit = R->getValueAsBitsInit("NewState"); + NewState = 0; + assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 && + "State cannot be represented in 64 bits!"); + for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) { + if (auto *Bit = dyn_cast(NewStateInit->getBit(I))) { + if (Bit->getValue()) + NewState |= 1ULL << I; + } + } + + for (StringRef A : Parent->getActionSymbolFields()) { + RecordVal *SymbolV = R->getValue(A); + if (auto *Ty = dyn_cast(SymbolV->getType())) { + Actions.emplace_back(R->getValueAsDef(A), 0, ""); + Types.emplace_back(Ty->getAsString()); + } else if (isa(SymbolV->getType())) { + Actions.emplace_back(nullptr, R->getValueAsInt(A), ""); + Types.emplace_back("unsigned"); + } else if (isa(SymbolV->getType()) || + isa(SymbolV->getType())) { + Actions.emplace_back(nullptr, 0, R->getValueAsString(A)); + Types.emplace_back("std::string"); + } else { + report_fatal_error("Unhandled symbol type!"); + } + + StringRef TypeOverride = Parent->getActionSymbolType(A); + if (!TypeOverride.empty()) + Types.back() = TypeOverride; + } +} + +bool Transition::canTransitionFrom(uint64_t State) { + if ((State & NewState) == 0) + // The bits we want to set are not set; + return true; + return false; +} + +uint64_t Transition::transitionFrom(uint64_t State) { + return State | NewState; +} + +void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; } + +void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) { + const ActionTuple &AT = Actions[A]; + if (AT.size() > 1) + OS << "std::make_tuple("; + bool First = true; + for (const auto &SingleAction : AT) { + if (!First) + OS << ", "; + First = false; + SingleAction.print(OS); + } + if (AT.size() > 1) + OS << ")"; +} + +namespace llvm { + +void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) { + AutomatonEmitter(RK).run(OS); +} + +} // namespace llvm diff --git a/llvm/utils/TableGen/DFAEmitter.h b/llvm/utils/TableGen/DFAEmitter.h new file mode 100644 index 00000000000000..76de8f72cd88e6 --- /dev/null +++ b/llvm/utils/TableGen/DFAEmitter.h @@ -0,0 +1,107 @@ +//===--------------------- DfaEmitter.h -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Defines a generic automaton builder. This takes a set of transitions and +// states that represent a nondeterministic finite state automaton (NFA) and +// emits a determinized DFA in a form that include/llvm/Support/Automaton.h can +// drive. +// +// See file llvm/TableGen/Automaton.td for the TableGen API definition. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UTILS_TABLEGEN_DFAEMITTER_H +#define LLVM_UTILS_TABLEGEN_DFAEMITTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Record.h" +#include +#include + +namespace llvm { + +class raw_ostream; +/// Construct a deterministic finite state automaton from possible +/// nondeterministic state and transition data. +/// +/// The state type is a 64-bit unsigned integer. The generated automaton is +/// invariant to the sparsity of the state representation - its size is only +/// a function of the cardinality of the set of states. +/// +/// The inputs to this emitter are considered to define a nondeterministic +/// finite state automaton (NFA). This is then converted to a DFA during +/// emission. The emitted tables can be used to by +/// include/llvm/Support/Automaton.h. +class DfaEmitter { +public: + // The type of an NFA state. The initial state is always zero. + using state_type = uint64_t; + // The type of an action. + using action_type = uint64_t; + + DfaEmitter() = default; + virtual ~DfaEmitter() = default; + + void addTransition(state_type From, state_type To, action_type A); + void emit(StringRef Name, raw_ostream &OS); + +protected: + /// Emit the C++ type of an action to OS. + virtual void printActionType(raw_ostream &OS); + /// Emit the C++ value of an action A to OS. + virtual void printActionValue(action_type A, raw_ostream &OS); + +private: + /// The state type of deterministic states. These are only used internally to + /// this class. This is an ID into the DfaStates UniqueVector. + using dfa_state_type = unsigned; + + /// The actual representation of a DFA state, which is a union of one or more + /// NFA states. + using DfaState = SmallVector; + + /// A DFA transition consists of a set of NFA states transitioning to a + /// new set of NFA states. The DfaTransitionInfo tracks, for every + /// transitioned-from NFA state, a set of valid transitioned-to states. + /// + /// Emission of this transition relation allows algorithmic determination of + /// the possible candidate NFA paths taken under a given input sequence to + /// reach a given DFA state. + using DfaTransitionInfo = SmallVector, 4>; + + /// The set of all possible actions. + std::set Actions; + + /// The set of nondeterministic transitions. A state-action pair can + /// transition to multiple target states. + std::map, std::vector> + NfaTransitions; + std::set NfaStates; + unsigned NumNfaTransitions = 0; + + /// The set of deterministic states. DfaStates.getId(DfaState) returns an ID, + /// which is dfa_state_type. Note that because UniqueVector reserves state + /// zero, the initial DFA state is always 1. + UniqueVector DfaStates; + /// The set of deterministic transitions. A state-action pair has only a + /// single target state. 
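+  /// The mapped value pairs the target DFA state with the set of NFA
+  /// state-to-state edges that the transition stands for, which is what
+  /// emit() lays out into the <Name>TransitionInfo table.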
+ std::map, + std::pair> + DfaTransitions; + + /// Visit all NFA states and construct the DFA. + void constructDfa(); + /// Visit a single DFA state and construct all possible transitions to new DFA + /// states. + void visitDfaState(DfaState DS); +}; + +} // namespace llvm + +#endif diff --git a/llvm/utils/TableGen/GICombinerEmitter.cpp b/llvm/utils/TableGen/GICombinerEmitter.cpp new file mode 100644 index 00000000000000..c2b64bcfb7c5e3 --- /dev/null +++ b/llvm/utils/TableGen/GICombinerEmitter.cpp @@ -0,0 +1,105 @@ +//===- GlobalCombinerEmitter.cpp - Generate a combiner --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Generate a combiner implementation for GlobalISel from a declarative +/// syntax +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Timer.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/TableGenBackend.h" +#include "CodeGenTarget.h" + +using namespace llvm; + +#define DEBUG_TYPE "gicombiner-emitter" + +cl::OptionCategory + GICombinerEmitterCat("Options for -gen-global-isel-combiner"); +static cl::list + SelectedCombiners("combiners", cl::desc("Emit the specified combiners"), + cl::cat(GICombinerEmitterCat), cl::CommaSeparated); +static cl::opt ShowExpansions( + "gicombiner-show-expansions", + cl::desc("Use C++ comments to indicate occurence of code expansion"), + cl::cat(GICombinerEmitterCat)); + +namespace { +class GICombinerEmitter { + StringRef Name; + Record *Combiner; +public: + explicit GICombinerEmitter(RecordKeeper &RK, StringRef Name, + Record *Combiner); + ~GICombinerEmitter() {} + + StringRef getClassName() const { + return Combiner->getValueAsString("Classname"); + } + void run(raw_ostream &OS); + +}; + +GICombinerEmitter::GICombinerEmitter(RecordKeeper &RK, StringRef Name, + Record *Combiner) + : Name(Name), Combiner(Combiner) {} + +void GICombinerEmitter::run(raw_ostream &OS) { + NamedRegionTimer T("Emit", "Time spent emitting the combiner", + "Code Generation", "Time spent generating code", + TimeRegions); + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_DEPS\n" + << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_DEPS\n\n"; + + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_H\n" + << "class " << getClassName() << " {\n" + << "public:\n" + << " bool tryCombineAll(\n" + << " GISelChangeObserver &Observer,\n" + << " MachineInstr &MI,\n" + << " MachineIRBuilder &B) const;\n" + << "};\n"; + OS << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_H\n\n"; + + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_CPP\n" + << "\n" + << "bool " << getClassName() << "::tryCombineAll(\n" + << " GISelChangeObserver &Observer,\n" + << " MachineInstr &MI,\n" + << " MachineIRBuilder &B) const {\n" + << " MachineBasicBlock *MBB = MI.getParent();\n" + << " MachineFunction *MF = MBB->getParent();\n" + << " MachineRegisterInfo &MRI = MF->getRegInfo();\n" + << " (void)MBB; (void)MF; (void)MRI;\n\n"; + OS << "\n return false;\n" + << "}\n" + << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_CPP\n"; +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// + +namespace llvm { +void 
EmitGICombiner(RecordKeeper &RK, raw_ostream &OS) { + CodeGenTarget Target(RK); + emitSourceFileHeader("Global Combiner", OS); + + if (SelectedCombiners.empty()) + PrintFatalError("No combiners selected with -combiners"); + for (const auto &Combiner : SelectedCombiners) { + Record *CombinerDef = RK.getDef(Combiner); + if (!CombinerDef) + PrintFatalError("Could not find " + Combiner); + GICombinerEmitter(RK, Combiner, CombinerDef).run(OS); + } +} + +} // namespace llvm diff --git a/llvm/utils/TableGen/GlobalISel/CMakeLists.txt b/llvm/utils/TableGen/GlobalISel/CMakeLists.txt new file mode 100644 index 00000000000000..2f74d1087bcd6b --- /dev/null +++ b/llvm/utils/TableGen/GlobalISel/CMakeLists.txt @@ -0,0 +1,7 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + +llvm_add_library(LLVMTableGenGlobalISel STATIC DISABLE_LLVM_LINK_LLVM_DYLIB + CodeExpander.cpp + ) diff --git a/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp b/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp new file mode 100644 index 00000000000000..d59a9b8e3b65eb --- /dev/null +++ b/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp @@ -0,0 +1,93 @@ +//===- CodeExpander.cpp - Expand variables in a string --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Expand the variables in a string. +// +//===----------------------------------------------------------------------===// + +#include "CodeExpander.h" +#include "CodeExpansions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Error.h" + +using namespace llvm; + +void CodeExpander::emit(raw_ostream &OS) const { + StringRef Current = Code; + + while (!Current.empty()) { + size_t Pos = Current.find_first_of("$\n\\"); + if (Pos == StringRef::npos) { + OS << Current; + Current = ""; + continue; + } + + OS << Current.substr(0, Pos); + Current = Current.substr(Pos); + + if (Current.startswith("\n")) { + OS << "\n" << Indent; + Current = Current.drop_front(1); + continue; + } + + if (Current.startswith("\\$") || Current.startswith("\\\\")) { + OS << Current[1]; + Current = Current.drop_front(2); + continue; + } + + if (Current.startswith("\\")) { + Current = Current.drop_front(1); + continue; + } + + if (Current.startswith("${")) { + StringRef StartVar = Current; + Current = Current.drop_front(2); + StringRef Var; + std::tie(Var, Current) = Current.split("}"); + + // Warn if we split because no terminator was found. + StringRef EndVar = StartVar.drop_front(2 /* ${ */ + Var.size()); + if (EndVar.empty()) { + size_t LocOffset = StartVar.data() - Code.data(); + PrintWarning( + Loc.size() > 0 && Loc[0].isValid() + ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Unterminated expansion"); + } + + auto ValueI = Expansions.find(Var); + if (ValueI == Expansions.end()) { + size_t LocOffset = StartVar.data() - Code.data(); + PrintError(Loc.size() > 0 && Loc[0].isValid() + ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Attempting to expand an undeclared variable " + Var); + } + if (ShowExpansions) + OS << "/*$" << Var << "{*/"; + OS << Expansions.lookup(Var); + if (ShowExpansions) + OS << "/*}*/"; + continue; + } + + size_t LocOffset = Current.data() - Code.data(); + PrintWarning(Loc.size() > 0 && Loc[0].isValid() + ? 
SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Assuming missing escape character"); + OS << "$"; + Current = Current.drop_front(1); + } +} diff --git a/llvm/utils/TableGen/GlobalISel/CodeExpander.h b/llvm/utils/TableGen/GlobalISel/CodeExpander.h new file mode 100644 index 00000000000000..bd6946de592589 --- /dev/null +++ b/llvm/utils/TableGen/GlobalISel/CodeExpander.h @@ -0,0 +1,55 @@ +//===- CodeExpander.h - Expand variables in a string ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Expand the variables in a string. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UTILS_TABLEGEN_CODEEXPANDER_H +#define LLVM_UTILS_TABLEGEN_CODEEXPANDER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/SMLoc.h" + +namespace llvm { +class CodeExpansions; +class raw_ostream; + +/// Emit the given code with all '${foo}' placeholders expanded to their +/// replacements. +/// +/// It's an error to use an undefined expansion and expansion-like output that +/// needs to be emitted verbatim can be escaped as '\${foo}' +/// +/// The emitted code can be given a custom indent to enable both indentation by +/// an arbitrary amount of whitespace and emission of the code as a comment. +class CodeExpander { + StringRef Code; + const CodeExpansions &Expansions; + const ArrayRef &Loc; + bool ShowExpansions; + StringRef Indent; + +public: + CodeExpander(StringRef Code, const CodeExpansions &Expansions, + const ArrayRef &Loc, bool ShowExpansions, + StringRef Indent = " ") + : Code(Code), Expansions(Expansions), Loc(Loc), + ShowExpansions(ShowExpansions), Indent(Indent) {} + + void emit(raw_ostream &OS) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const CodeExpander &Expander) { + Expander.emit(OS); + return OS; +} +} // end namespace llvm + +#endif // ifndef LLVM_UTILS_TABLEGEN_CODEEXPANDER_H diff --git a/llvm/utils/TableGen/GlobalISel/CodeExpansions.h b/llvm/utils/TableGen/GlobalISel/CodeExpansions.h new file mode 100644 index 00000000000000..bb890ec8f57ef8 --- /dev/null +++ b/llvm/utils/TableGen/GlobalISel/CodeExpansions.h @@ -0,0 +1,43 @@ +//===- CodeExpansions.h - Record expansions for CodeExpander --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Record the expansions to use in a CodeExpander. 
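+///
+/// A minimal usage sketch (hypothetical strings; the API is the one declared
+/// below and in CodeExpander.h):
+///
+///   CodeExpansions Vars;
+///   Vars.declare("name", "MyHelper");
+///   OS << CodeExpander("class ${name};", Vars, SMLoc(), false);
+///
+/// which would print "class MyHelper;" to OS.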
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringMap.h" + +#ifndef LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H +#define LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H +namespace llvm { +class CodeExpansions { +public: + using const_iterator = StringMap::const_iterator; + +protected: + StringMap Expansions; + +public: + void declare(StringRef Name, StringRef Expansion) { + bool Inserted = Expansions.try_emplace(Name, Expansion).second; + assert(Inserted && "Declared variable twice"); + (void)Inserted; + } + + std::string lookup(StringRef Variable) const { + return Expansions.lookup(Variable); + } + + const_iterator begin() const { return Expansions.begin(); } + const_iterator end() const { return Expansions.end(); } + const_iterator find(StringRef Variable) const { + return Expansions.find(Variable); + } +}; +} // end namespace llvm +#endif // ifndef LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index b7961efbf96366..300ba36a700740 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -662,6 +662,7 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num, CodeGenTarget &Target = CDP.getTargetInfo(); // Emit all of the target independent flags... + if (Inst.isPreISelOpcode) OS << "|(1ULL< &ArgCodes, Sig.push_back(IIT_SUBDIVIDE2_ARG); else if (R->isSubClassOf("LLVMSubdivide4VectorType")) Sig.push_back(IIT_SUBDIVIDE4_ARG); + else if (R->isSubClassOf("LLVMVectorOfBitcastsToInt")) + Sig.push_back(IIT_VEC_OF_BITCASTS_TO_INT); else Sig.push_back(IIT_ARG); return Sig.push_back((Number << 3) | 7 /*IITDescriptor::AK_MatchType*/); diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index 29ef46fd7fccda..f730d91160ad5d 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -49,10 +49,12 @@ enum ActionType { GenAttributes, GenSearchableTables, GenGlobalISel, + GenGICombiner, GenX86EVEX2VEXTables, GenX86FoldTables, GenRegisterBank, GenExegesis, + GenAutomata, }; namespace llvm { @@ -62,75 +64,75 @@ bool TimeRegions = false; } // end namespace llvm namespace { - cl::opt - Action(cl::desc("Action to perform:"), - cl::values(clEnumValN(PrintRecords, "print-records", - "Print all records to stdout (default)"), - clEnumValN(DumpJSON, "dump-json", - "Dump all records as machine-readable JSON"), - clEnumValN(GenEmitter, "gen-emitter", - "Generate machine code emitter"), - clEnumValN(GenRegisterInfo, "gen-register-info", - "Generate registers and register classes info"), - clEnumValN(GenInstrInfo, "gen-instr-info", - "Generate instruction descriptions"), - clEnumValN(GenInstrDocs, "gen-instr-docs", - "Generate instruction documentation"), - clEnumValN(GenCallingConv, "gen-callingconv", - "Generate calling convention descriptions"), - clEnumValN(GenAsmWriter, "gen-asm-writer", - "Generate assembly writer"), - clEnumValN(GenDisassembler, "gen-disassembler", - "Generate disassembler"), - clEnumValN(GenPseudoLowering, "gen-pseudo-lowering", - "Generate pseudo instruction lowering"), - clEnumValN(GenCompressInst, "gen-compress-inst-emitter", - "Generate RISCV compressed instructions."), - clEnumValN(GenAsmMatcher, "gen-asm-matcher", - "Generate assembly instruction matcher"), - clEnumValN(GenDAGISel, "gen-dag-isel", - "Generate a DAG instruction selector"), - clEnumValN(GenDFAPacketizer, "gen-dfa-packetizer", - "Generate DFA Packetizer for VLIW 
targets"), - clEnumValN(GenFastISel, "gen-fast-isel", - "Generate a \"fast\" instruction selector"), - clEnumValN(GenSubtarget, "gen-subtarget", - "Generate subtarget enumerations"), - clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums", - "Generate intrinsic enums"), - clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl", - "Generate intrinsic information"), - clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums", - "Generate target intrinsic enums"), - clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl", - "Generate target intrinsic information"), - clEnumValN(PrintEnums, "print-enums", - "Print enum values for a class"), - clEnumValN(PrintSets, "print-sets", - "Print expanded sets for testing DAG exprs"), - clEnumValN(GenOptParserDefs, "gen-opt-parser-defs", - "Generate option definitions"), - clEnumValN(GenCTags, "gen-ctags", - "Generate ctags-compatible index"), - clEnumValN(GenAttributes, "gen-attrs", - "Generate attributes"), - clEnumValN(GenSearchableTables, "gen-searchable-tables", - "Generate generic binary-searchable table"), - clEnumValN(GenGlobalISel, "gen-global-isel", - "Generate GlobalISel selector"), - clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables", - "Generate X86 EVEX to VEX compress tables"), - clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", - "Generate X86 fold tables"), - clEnumValN(GenRegisterBank, "gen-register-bank", - "Generate registers bank descriptions"), - clEnumValN(GenExegesis, "gen-exegesis", - "Generate llvm-exegesis tables"))); +cl::opt Action( + cl::desc("Action to perform:"), + cl::values( + clEnumValN(PrintRecords, "print-records", + "Print all records to stdout (default)"), + clEnumValN(DumpJSON, "dump-json", + "Dump all records as machine-readable JSON"), + clEnumValN(GenEmitter, "gen-emitter", "Generate machine code emitter"), + clEnumValN(GenRegisterInfo, "gen-register-info", + "Generate registers and register classes info"), + clEnumValN(GenInstrInfo, "gen-instr-info", + "Generate instruction descriptions"), + clEnumValN(GenInstrDocs, "gen-instr-docs", + "Generate instruction documentation"), + clEnumValN(GenCallingConv, "gen-callingconv", + "Generate calling convention descriptions"), + clEnumValN(GenAsmWriter, "gen-asm-writer", "Generate assembly writer"), + clEnumValN(GenDisassembler, "gen-disassembler", + "Generate disassembler"), + clEnumValN(GenPseudoLowering, "gen-pseudo-lowering", + "Generate pseudo instruction lowering"), + clEnumValN(GenCompressInst, "gen-compress-inst-emitter", + "Generate RISCV compressed instructions."), + clEnumValN(GenAsmMatcher, "gen-asm-matcher", + "Generate assembly instruction matcher"), + clEnumValN(GenDAGISel, "gen-dag-isel", + "Generate a DAG instruction selector"), + clEnumValN(GenDFAPacketizer, "gen-dfa-packetizer", + "Generate DFA Packetizer for VLIW targets"), + clEnumValN(GenFastISel, "gen-fast-isel", + "Generate a \"fast\" instruction selector"), + clEnumValN(GenSubtarget, "gen-subtarget", + "Generate subtarget enumerations"), + clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums", + "Generate intrinsic enums"), + clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl", + "Generate intrinsic information"), + clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums", + "Generate target intrinsic enums"), + clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl", + "Generate target intrinsic information"), + clEnumValN(PrintEnums, "print-enums", "Print enum values for a class"), + clEnumValN(PrintSets, "print-sets", + "Print expanded sets for testing DAG exprs"), + 
clEnumValN(GenOptParserDefs, "gen-opt-parser-defs", + "Generate option definitions"), + clEnumValN(GenCTags, "gen-ctags", "Generate ctags-compatible index"), + clEnumValN(GenAttributes, "gen-attrs", "Generate attributes"), + clEnumValN(GenSearchableTables, "gen-searchable-tables", + "Generate generic binary-searchable table"), + clEnumValN(GenGlobalISel, "gen-global-isel", + "Generate GlobalISel selector"), + clEnumValN(GenGICombiner, "gen-global-isel-combiner", + "Generate GlobalISel combiner"), + clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables", + "Generate X86 EVEX to VEX compress tables"), + clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", + "Generate X86 fold tables"), + clEnumValN(GenRegisterBank, "gen-register-bank", + "Generate registers bank descriptions"), + clEnumValN(GenExegesis, "gen-exegesis", + "Generate llvm-exegesis tables"), + clEnumValN(GenAutomata, "gen-automata", + "Generate generic automata"))); - cl::OptionCategory PrintEnumsCat("Options for -print-enums"); - cl::opt - Class("class", cl::desc("Print Enum list for this class"), - cl::value_desc("class name"), cl::cat(PrintEnumsCat)); +cl::OptionCategory PrintEnumsCat("Options for -print-enums"); +cl::opt Class("class", cl::desc("Print Enum list for this class"), + cl::value_desc("class name"), + cl::cat(PrintEnumsCat)); cl::opt TimeRegionsOpt("time-regions", @@ -235,6 +237,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenGlobalISel: EmitGlobalISel(Records, OS); break; + case GenGICombiner: + EmitGICombiner(Records, OS); + break; case GenRegisterBank: EmitRegisterBank(Records, OS); break; @@ -247,6 +252,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenExegesis: EmitExegesis(Records, OS); break; + case GenAutomata: + EmitAutomata(Records, OS); + break; } return false; diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h index 135ec65c0f954c..8c067dd51b3b4f 100644 --- a/llvm/utils/TableGen/TableGenBackends.h +++ b/llvm/utils/TableGen/TableGenBackends.h @@ -85,10 +85,12 @@ void EmitCTags(RecordKeeper &RK, raw_ostream &OS); void EmitAttributes(RecordKeeper &RK, raw_ostream &OS); void EmitSearchableTables(RecordKeeper &RK, raw_ostream &OS); void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS); +void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS); void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS); void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS); void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS); void EmitExegesis(RecordKeeper &RK, raw_ostream &OS); +void EmitAutomata(RecordKeeper &RK, raw_ostream &OS); } // End llvm namespace diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 1eb354d8a46afb..81556d65802c9c 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -58,6 +58,12 @@ class string: # .Lfunc_end0: (mips64 - NewABI) flags=(re.M | re.S)) +ASM_FUNCTION_MSP430_RE = re.compile( + r'^_?(?P[^:]+):[ \t]*;+[ \t]*@(?P=func)\n[^:]*?' + r'(?P.*?)\n' + r'(\$|\.L)func_end[0-9]+:\n', # $func_end0: + flags=(re.M | re.S)) + ASM_FUNCTION_PPC_RE = re.compile( r'^_?(?P[^:]+):[ \t]*#+[ \t]*@(?P=func)\n' r'.*?' @@ -231,6 +237,16 @@ def scrub_asm_mips(asm, args): asm = common.SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm) return asm +def scrub_asm_msp430(asm, args): + # Scrub runs of whitespace out of the assembly, but leave the leading + # whitespace in place. 
+ asm = common.SCRUB_WHITESPACE_RE.sub(r' ', asm) + # Expand the tabs used for indentation. + asm = string.expandtabs(asm, 2) + # Strip trailing whitespace. + asm = common.SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm) + return asm + def scrub_asm_riscv(asm, args): # Scrub runs of whitespace out of the assembly, but leave the leading # whitespace in place. @@ -315,6 +331,7 @@ def build_function_body_dictionary_for_triple(args, raw_tool_output, triple, pre 'thumbv5-macho': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), 'thumbv7-apple-ios' : (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_IOS_RE), 'mips': (scrub_asm_mips, ASM_FUNCTION_MIPS_RE), + 'msp430': (scrub_asm_msp430, ASM_FUNCTION_MSP430_RE), 'ppc32': (scrub_asm_powerpc, ASM_FUNCTION_PPC_RE), 'powerpc': (scrub_asm_powerpc, ASM_FUNCTION_PPC_RE), 'riscv32': (scrub_asm_riscv, ASM_FUNCTION_RISCV_RE), diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index ecb3a0f0a72e29..972b65505b45ce 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -267,10 +267,12 @@ def add_checks(output_lines, comment_marker, prefix_list, func_dict, func_name, output_lines.append(comment_marker) break -def add_ir_checks(output_lines, comment_marker, prefix_list, func_dict, func_name): +def add_ir_checks(output_lines, comment_marker, prefix_list, func_dict, + func_name, preserve_names): # Label format is based on IR string. check_label_format = '{} %s-LABEL: @%s('.format(comment_marker) - add_checks(output_lines, comment_marker, prefix_list, func_dict, func_name, check_label_format, False, False) + add_checks(output_lines, comment_marker, prefix_list, func_dict, func_name, + check_label_format, False, preserve_names) def add_analyze_checks(output_lines, comment_marker, prefix_list, func_dict, func_name): check_label_format = '{} %s-LABEL: \'%s\''.format(comment_marker) diff --git a/llvm/utils/bisect b/llvm/utils/bisect index 0f9e53afa7f4a1..9df2cd9e1136bc 100755 --- a/llvm/utils/bisect +++ b/llvm/utils/bisect @@ -12,6 +12,7 @@ # And bisect will continually call ./script.sh with various counts using # the exit status to determine success and failure. # +from __future__ import print_function import os import sys import argparse @@ -34,10 +35,10 @@ print("End: %d" % end) last = None while start != end and start != end-1: - count = start + (end - start)/2 + count = start + (end - start)//2 print("Visiting Count: %d with (Start, End) = (%d,%d)" % (count, start, end)) cmd = [x % {'count':count} for x in args.command] - print cmd + print(cmd) result = subprocess.call(cmd) if result == 0: print(" PASSES! Setting start to count") diff --git a/llvm/utils/bisect-skip-count b/llvm/utils/bisect-skip-count index f4f8ddcec797ad..efdd2c937e1529 100755 --- a/llvm/utils/bisect-skip-count +++ b/llvm/utils/bisect-skip-count @@ -20,6 +20,7 @@ # result. Incrementing the last good count by one or decrementing the # last good skip by one should produce a failure. 
# +from __future__ import print_function import os import sys import argparse @@ -52,10 +53,10 @@ print("End: %d" % end) last = None while start != end and start != end-1: - count = start + (end - start)/2 + count = start + (end - start)//2 print("Visiting Skip: %d with (Start, End) = (%d,%d)" % (count, start, end)) cmd = [x % {'skip':count, 'count':-1} for x in args.command] - print cmd + print(cmd) try: result = subprocess.call(cmd, shell=args.shell, timeout=args.timeout) if result == 0: @@ -75,10 +76,10 @@ print("Bisect of Count starting!") print("Start: %d" % start) print("End: %d" % end) while start != end and start != end-1: - count = start + (end - start)/2 + count = start + (end - start)//2 print("Visiting Count: %d with (Start, End) = (%d,%d)" % (count, start, end)) cmd = [x % {'count':count, 'skip':firstcount } for x in args.command] - print cmd + print(cmd) try: result = subprocess.call(cmd, shell=args.shell, timeout=args.timeout) if result == 0: diff --git a/llvm/utils/gn/build/sync_source_lists_from_cmake.py b/llvm/utils/gn/build/sync_source_lists_from_cmake.py index 5063f71a826bad..4e64b0cac87906 100755 --- a/llvm/utils/gn/build/sync_source_lists_from_cmake.py +++ b/llvm/utils/gn/build/sync_source_lists_from_cmake.py @@ -155,6 +155,5 @@ def main(): sys.exit(1) - if __name__ == '__main__': main() diff --git a/llvm/utils/gn/build/toolchain/BUILD.gn b/llvm/utils/gn/build/toolchain/BUILD.gn index 70e259e116f4b5..c36579f28e4325 100644 --- a/llvm/utils/gn/build/toolchain/BUILD.gn +++ b/llvm/utils/gn/build/toolchain/BUILD.gn @@ -265,6 +265,7 @@ toolchain("win") { dllfile, libfile, ] + lib_switch = "" default_output_extension = ".dll" restat = true @@ -287,6 +288,7 @@ toolchain("win") { outputs = [ dllfile, ] + lib_switch = "" runtime_outputs = outputs default_output_extension = ".dll" @@ -302,6 +304,7 @@ toolchain("win") { outputs = [ outfile, ] + lib_switch = "" default_output_extension = ".exe" # Setting this allows targets to override the default executable output by diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn index a467d4947bffbe..5d36602f97efb4 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn @@ -18,6 +18,7 @@ static_library("cppcoreguidelines") { sources = [ "AvoidGotoCheck.cpp", "CppCoreGuidelinesTidyModule.cpp", + "InitVariablesCheck.cpp", "InterfacesGlobalInitCheck.cpp", "MacroUsageCheck.cpp", "NarrowingConversionsCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang/include/clang/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/AST/BUILD.gn index 49b7e1b2b93f9b..3bb292886db81d 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/AST/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/AST/BUILD.gn @@ -58,7 +58,6 @@ clang_tablegen("DeclNodes") { clang_tablegen("TypeNodes") { args = [ "-gen-clang-type-nodes" ] td_file = "../Basic/TypeNodes.td" - output_name = "TypeNodes.def" } clang_tablegen("CommentNodes") { diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index f70195eb05554b..8f7c27ebf0b64f 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -103,6 +103,7 @@ static_library("CodeGen") { "MachineInstrBundle.cpp", "MachineLICM.cpp", "MachineLoopInfo.cpp", + 
"MachineLoopUtils.cpp", "MachineModuleInfo.cpp", "MachineModuleInfoImpls.cpp", "MachineOperand.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn index aa9894cbf86387..491707a52d2bf3 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn @@ -11,7 +11,7 @@ static_library("JITLink") { "JITLinkGeneric.cpp", "JITLinkMemoryManager.cpp", "MachO.cpp", - "MachOAtomGraphBuilder.cpp", + "MachOLinkGraphBuilder.cpp", "MachO_x86_64.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn index b05e0891d124e7..0c27c11e2e0ba0 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn @@ -30,6 +30,15 @@ tablegen("AArch64GenGlobalISel") { td_file = "AArch64.td" } +tablegen("AArch64GenGICombiner") { + visibility = [ ":LLVMAArch64CodeGen" ] + args = [ + "-gen-global-isel-combiner", + "-combiners=AArch64PreLegalizerCombinerHelper", + ] + td_file = "AArch64.td" +} + tablegen("AArch64GenMCPseudoLowering") { visibility = [ ":LLVMAArch64CodeGen" ] args = [ "-gen-pseudo-lowering" ] @@ -48,6 +57,7 @@ static_library("LLVMAArch64CodeGen") { ":AArch64GenCallingConv", ":AArch64GenDAGISel", ":AArch64GenFastISel", + ":AArch64GenGICombiner", ":AArch64GenGlobalISel", ":AArch64GenMCPseudoLowering", ":AArch64GenRegisterBank", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn index d6c256b52b43d6..ec7f3d81cba916 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn @@ -26,5 +26,6 @@ static_library("Instrumentation") { "PoisonChecking.cpp", "SanitizerCoverage.cpp", "ThreadSanitizer.cpp", + "ValueProfileCollector.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn index 721b4c4be11508..ee16b0a3a954c2 100644 --- a/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn @@ -1,9 +1,18 @@ +import("//llvm/utils/TableGen/tablegen.gni") + +tablegen("Options") { + visibility = [ ":dsymutil" ] + args = [ "-gen-opt-parser-defs" ] +} + executable("dsymutil") { deps = [ + ":Options", "//llvm/lib/CodeGen/AsmPrinter", "//llvm/lib/DebugInfo/DWARF", "//llvm/lib/MC", "//llvm/lib/Object", + "//llvm/lib/Option", "//llvm/lib/Support", "//llvm/lib/Target", "//llvm/lib/Target:TargetsToBuild", diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn index 00483433122152..58334c8caf6373 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn @@ -40,7 +40,7 @@ write_cmake_config("BuildVariables.inc") { # lib/Support/Windows/Path.inc. 
# advapi32 required for CryptAcquireContextW in # lib/Support/Windows/Path.inc - system_libs = "psapi.lib shell32.lib ole32.lib uuid.lib advapi32" + system_libs = "psapi.lib shell32.lib ole32.lib uuid.lib advapi32.lib" } else { system_libs += "-lm" if (host_os == "linux") { diff --git a/llvm/utils/gn/secondary/llvm/triples.gni b/llvm/utils/gn/secondary/llvm/triples.gni index efe8be28513c67..558731295cee79 100644 --- a/llvm/utils/gn/secondary/llvm/triples.gni +++ b/llvm/utils/gn/secondary/llvm/triples.gni @@ -10,7 +10,7 @@ if (current_cpu == "x86") { } else if (current_os == "mac") { llvm_current_triple = "x86_64-apple-darwin" } else if (current_os == "win") { - llvm_current_triple = "x86_64-pc-windows" + llvm_current_triple = "x86_64-pc-windows-msvc" } } else if (current_cpu == "arm64") { if (current_os == "android") { diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn index f03456dca4a890..3d607dd7747117 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn @@ -34,6 +34,7 @@ group("unittests") { "Remarks:RemarksTests", "Support:SupportTests", "Support/DynamicLibrary:DynamicLibraryTests", + "TableGen:TableGenTests", "TextAPI:TextAPITests", "Transforms/IPO:IPOTests", "Transforms/Scalar:ScalarTests", diff --git a/llvm/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn new file mode 100644 index 00000000000000..dbd10e6d0913e7 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn @@ -0,0 +1,29 @@ +import("//llvm/utils/TableGen/tablegen.gni") +import("//llvm/utils/unittest/unittest.gni") + +tablegen("AutomataAutomata") { + visibility = [ ":TableGenTests" ] + args = [ "-gen-automata" ] + td_file = "Automata.td" +} + +tablegen("AutomataTables") { + visibility = [ ":TableGenTests" ] + args = [ "-gen-searchable-tables" ] + td_file = "Automata.td" +} + +unittest("TableGenTests") { + deps = [ + ":AutomataAutomata", + ":AutomataTables", + "//llvm/lib/Support", + "//llvm/lib/TableGen", + "//llvm/utils/TableGen/GlobalISel", + ] + include_dirs = [ "//llvm/utils/TableGen" ] + sources = [ + "AutomataTest.cpp", + "CodeExpanderTest.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index 01219543d2db72..c50bebe71c0e05 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn @@ -4,6 +4,7 @@ executable("llvm-tblgen") { "//llvm/lib/MC", "//llvm/lib/Support", "//llvm/lib/TableGen", + "//llvm/utils/TableGen/GlobalISel", ] sources = [ "AsmMatcherEmitter.cpp", @@ -25,11 +26,13 @@ executable("llvm-tblgen") { "DAGISelMatcherEmitter.cpp", "DAGISelMatcherGen.cpp", "DAGISelMatcherOpt.cpp", + "DFAEmitter.cpp", "DFAPacketizerEmitter.cpp", "DisassemblerEmitter.cpp", "ExegesisEmitter.cpp", "FastISelEmitter.cpp", "FixedLenDecoderEmitter.cpp", + "GICombinerEmitter.cpp", "GlobalISelEmitter.cpp", "InfoByHwMode.cpp", "InstrDocsEmitter.cpp", diff --git a/llvm/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn b/llvm/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn new file mode 100644 index 00000000000000..14955e5b98fcf1 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn @@ -0,0 +1,8 @@ +static_library("GlobalISel") { + deps = [ + "//llvm/lib/Support", + ] + sources = [ + "CodeExpander.cpp", + ] +} diff --git 
a/llvm/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni b/llvm/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni index a0e2e7967d6b5a..cb588abbaa6a59 100644 --- a/llvm/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni +++ b/llvm/utils/gn/secondary/llvm/utils/TableGen/tablegen.gni @@ -64,11 +64,6 @@ template("tablegen") { depfile = "$gen_output.d" td_file = rebase_path(td_file, root_build_dir) - # FIXME: The cmake build lets tablegen write to a temp file and then copies - # it over the final output only if it has changed, for ninja's restat - # optimization. Instead of doing that in cmake, llvm-tblgen should do this - # itself. r330742 tried this, but it caused problems. Fix those and reland, - # so that the gn build has the optimization too. args = [ rebase_path(tblgen_executable, root_build_dir), "-I", diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py index 49aaf638a2213c..fc9c0b406c64e0 100755 --- a/llvm/utils/lit/lit/main.py +++ b/llvm/utils/lit/lit/main.py @@ -209,8 +209,8 @@ def main_with_tmp(builtinParameters): parser.add_argument("--version", dest="show_version", help="Show version and exit", action="store_true", default=False) - parser.add_argument("-j", "--threads", dest="numThreads", metavar="N", - help="Number of testing threads", + parser.add_argument("-j", "--threads", "--workers", dest="numWorkers", metavar="N", + help="Number of workers used for testing", type=int, default=None) parser.add_argument("--config-prefix", dest="configPrefix", metavar="NAME", help="Prefix for 'lit' config files", @@ -334,10 +334,10 @@ def main_with_tmp(builtinParameters): if not args: parser.error('No inputs specified') - if opts.numThreads is None: - opts.numThreads = lit.util.detectCPUs() - elif opts.numThreads <= 0: - parser.error("Option '--threads' or '-j' requires positive integer") + if opts.numWorkers is None: + opts.numWorkers = lit.util.detectCPUs() + elif opts.numWorkers <= 0: + parser.error("Option '--workers' or '-j' requires positive integer") if opts.maxFailures is not None and opts.maxFailures <= 0: parser.error("Option '--max-failures' requires positive integer") @@ -480,8 +480,8 @@ def main_with_tmp(builtinParameters): if opts.maxTests is not None: run.tests = run.tests[:opts.maxTests] - # Don't create more threads than tests. - opts.numThreads = min(len(run.tests), opts.numThreads) + # Don't create more workers than tests. + opts.numWorkers = min(len(run.tests), opts.numWorkers) # Because some tests use threads internally, and at least on Linux each # of these threads counts toward the current process limit, try to @@ -489,7 +489,7 @@ def main_with_tmp(builtinParameters): # resource exhaustion. try: cpus = lit.util.detectCPUs() - desired_limit = opts.numThreads * cpus * 2 # the 2 is a safety factor + desired_limit = opts.numWorkers * cpus * 2 # the 2 is a safety factor # Import the resource module here inside this try block because it # will likely fail on Windows. 
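
The -j/--threads/--workers change above leans on argparse's support for giving one destination several option strings, so the old spellings keep working while the help text and internals talk about workers. A standalone check of that pattern:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-j", "--threads", "--workers", dest="numWorkers",
                        metavar="N", type=int, default=None,
                        help="Number of workers used for testing")

    # All three spellings land in the same attribute.
    print(parser.parse_args(["--workers", "4"]).numWorkers)   # -> 4
    print(parser.parse_args(["--threads", "4"]).numWorkers)   # -> 4
    print(parser.parse_args(["-j", "4"]).numWorkers)          # -> 4
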
@@ -506,8 +506,7 @@ def main_with_tmp(builtinParameters): pass extra = (' of %d' % numTotalTests) if (len(run.tests) != numTotalTests) else '' - threads = 'single process' if (opts.numThreads == 1) else ('%d threads' % opts.numThreads) - header = '-- Testing: %d%s tests, %s --' % (len(run.tests), extra, threads) + header = '-- Testing: %d%s tests, %d workers --' % (len(run.tests), extra, opts.numWorkers) progressBar = None if not opts.quiet: if opts.succinct and opts.useProgressBar: @@ -523,7 +522,7 @@ def main_with_tmp(builtinParameters): startTime = time.time() display = TestingProgressDisplay(opts, len(run.tests), progressBar) try: - run.execute_tests(display, opts.numThreads, opts.maxTime) + run.execute_tests(display, opts.numWorkers, opts.maxTime) except KeyboardInterrupt: sys.exit(2) display.finish() diff --git a/llvm/utils/lit/lit/run.py b/llvm/utils/lit/lit/run.py index 18e754addd3280..dbd0822b695e28 100644 --- a/llvm/utils/lit/lit/run.py +++ b/llvm/utils/lit/lit/run.py @@ -37,7 +37,7 @@ def __init__(self, lit_config, tests): multiprocessing.BoundedSemaphore(v) for k, v in lit_config.parallelism_groups.items()} - def execute_tests_in_pool(self, jobs, max_time): + def _execute_tests_in_pool(self, workers, max_time): # We need to issue many wait calls, so compute the final deadline and # subtract time.time() from that as we go along. deadline = None @@ -49,7 +49,7 @@ def execute_tests_in_pool(self, jobs, max_time): # interrupts the workers before we make it into our task callback, they # will each raise a KeyboardInterrupt exception and print to stderr at # the same time. - pool = multiprocessing.Pool(jobs, lit.worker.initializer, + pool = multiprocessing.Pool(workers, lit.worker.initializer, (self.lit_config, self.parallelism_semaphores)) @@ -93,11 +93,11 @@ def console_ctrl_handler(type): finally: pool.join() - def execute_tests(self, display, jobs, max_time=None): + def execute_tests(self, display, workers, max_time=None): """ - execute_tests(display, jobs, [max_time]) + execute_tests(display, workers, [max_time]) - Execute each of the tests in the run, using up to jobs number of + Execute the tests in the run using up to the specified number of parallel tasks, and inform the display of each individual result. The provided tests should be a subset of the tests available in this run object. @@ -105,10 +105,8 @@ def execute_tests(self, display, jobs, max_time=None): If max_time is non-None, it should be a time in seconds after which to stop executing tests. - The display object will have its update method called with each test as - it is completed. The calls are guaranteed to be locked with respect to - one another, but are *not* guaranteed to be called on the same thread as - this method was invoked on. + The display object will have its update method called for each completed + test. Upon completion, each test in the run will have its result computed. Tests which were not actually executed (for any reason) will @@ -124,14 +122,14 @@ def execute_tests(self, display, jobs, max_time=None): self.failure_count = 0 self.hit_max_failures = False - if jobs == 1: + if workers == 1: for test_index, test in enumerate(self.tests): lit.worker._execute_test(test, self.lit_config) self.consume_test_result((test_index, test)) if self.hit_max_failures: break else: - self.execute_tests_in_pool(jobs, max_time) + self._execute_tests_in_pool(workers, max_time) # Mark any tests that weren't run as UNRESOLVED. 
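
lit's Run class feeds results back through a display callback and per-group semaphores; the following is only a much reduced sketch of the pool-with-capped-workers idea, with a stub worker and made-up test names:

    import multiprocessing

    def _run_one(test):
        return test, "PASS"                  # placeholder for lit.worker's per-test work

    def run_all(tests, workers):
        workers = min(workers, len(tests))   # never spawn more workers than tests
        with multiprocessing.Pool(workers) as pool:
            for test, result in pool.imap_unordered(_run_one, tests):
                print(result, test)          # stands in for display.update(test)

    if __name__ == '__main__':
        run_all(["test1.txt", "test2.txt"], workers=2)
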
for test in self.tests: diff --git a/llvm/utils/lit/tests/discovery.py b/llvm/utils/lit/tests/discovery.py index 9f09470c48c571..b15468f10159f5 100644 --- a/llvm/utils/lit/tests/discovery.py +++ b/llvm/utils/lit/tests/discovery.py @@ -29,7 +29,7 @@ # RUN: %{python} %{inputs}/config-map-discovery/driver.py \ # RUN: %{inputs}/config-map-discovery/main-config/lit.cfg \ # RUN: %{inputs}/config-map-discovery/lit.alt.cfg \ -# RUN: --threads=1 --debug --show-tests --show-suites > %t.out 2> %t.err +# RUN: --workers=1 --debug --show-tests --show-suites > %t.out 2> %t.err # RUN: FileCheck --check-prefix=CHECK-CONFIG-MAP-OUT < %t.out %s # RUN: FileCheck --check-prefix=CHECK-CONFIG-MAP-ERR < %t.err %s diff --git a/llvm/utils/lit/tests/parallelism-groups.py b/llvm/utils/lit/tests/parallelism-groups.py index c6427bee124915..d80f2318fe05f9 100644 --- a/llvm/utils/lit/tests/parallelism-groups.py +++ b/llvm/utils/lit/tests/parallelism-groups.py @@ -15,7 +15,7 @@ # RUN: %{lit} -j2 %{inputs}/parallelism-groups | FileCheck %s -# CHECK: -- Testing: 2 tests, 2 threads -- +# CHECK: -- Testing: 2 tests, 2 workers -- # CHECK-DAG: PASS: parallelism-groups :: test1.txt # CHECK-DAG: PASS: parallelism-groups :: test2.txt # CHECK: Expected Passes : 2 diff --git a/llvm/utils/llvm-locstats/CMakeLists.txt b/llvm/utils/llvm-locstats/CMakeLists.txt new file mode 100644 index 00000000000000..a919023e141e97 --- /dev/null +++ b/llvm/utils/llvm-locstats/CMakeLists.txt @@ -0,0 +1,12 @@ +if (LLVM_BUILD_UTILS AND LLVM_BUILD_TOOLS) + add_custom_command( + OUTPUT ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + DEPENDS ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py + COMMAND ${CMAKE_COMMAND} -E copy ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + COMMENT "Copying llvm-locstats into ${LLVM_TOOLS_BINARY_DIR}" + ) + add_custom_target(llvm-locstats ALL + DEPENDS ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + ) + set_target_properties(llvm-locstats PROPERTIES FOLDER "Tools") +endif() diff --git a/llvm/utils/llvm-locstats/llvm-locstats.py b/llvm/utils/llvm-locstats/llvm-locstats.py new file mode 100755 index 00000000000000..22e5ae6b223bbb --- /dev/null +++ b/llvm/utils/llvm-locstats/llvm-locstats.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python +# +# This is a tool that works like debug location coverage calculator. +# It parses the llvm-dwarfdump --statistics output by reporting it +# in a more human readable way. 
+# + +from __future__ import print_function +import argparse +import os +import sys +from json import loads +from math import ceil +from subprocess import Popen, PIPE + +def coverage_buckets(): + yield '0%' + yield '1-9%' + for start in range(10, 91, 10): + yield '{0}-{1}%'.format(start, start + 9) + yield '100%' + +def locstats_output( + variables_total, + variables_total_locstats, + variables_with_loc, + scope_bytes_covered, + scope_bytes_from_first_def, + variables_coverage_map + ): + + pc_ranges_covered = int(ceil(scope_bytes_covered * 100.0) + / scope_bytes_from_first_def) + variables_coverage_per_map = {} + for cov_bucket in coverage_buckets(): + variables_coverage_per_map[cov_bucket] = \ + int(ceil(variables_coverage_map[cov_bucket] * 100.0) \ + / variables_total_locstats) + + print (' =================================================') + print (' Debug Location Statistics ') + print (' =================================================') + print (' cov% samples percentage(~) ') + print (' -------------------------------------------------') + for cov_bucket in coverage_buckets(): + print (' {0:6} {1:8d} {2:3d}%'. \ + format(cov_bucket, variables_coverage_map[cov_bucket], \ + variables_coverage_per_map[cov_bucket])) + print (' =================================================') + print (' -the number of debug variables processed: ' \ + + str(variables_total_locstats)) + print (' -PC ranges covered: ' + str(pc_ranges_covered) + '%') + + # Only if we are processing all the variables output the total + # availability. + if variables_total and variables_with_loc: + total_availability = int(ceil(variables_with_loc * 100.0) \ + / variables_total) + print (' -------------------------------------------------') + print (' -total availability: ' + str(total_availability) + '%') + print (' =================================================') + +def parse_program_args(parser): + parser.add_argument('-only-variables', action='store_true', + default=False, + help='calculate the location statistics only for ' + 'local variables' + ) + parser.add_argument('-only-formal-parameters', action='store_true', + default=False, + help='calculate the location statistics only for ' + 'formal parameters' + ) + parser.add_argument('-ignore-debug-entry-values', action='store_true', + default=False, + help='ignore the location statistics on locations with ' + 'entry values' + ) + parser.add_argument('file_name', type=str, help='file to process') + return parser.parse_args() + + +def Main(): + parser = argparse.ArgumentParser() + results = parse_program_args(parser) + + if len(sys.argv) < 2: + print ('error: Too few arguments.') + parser.print_help() + sys.exit(1) + + if results.only_variables and results.only_formal_parameters: + print ('error: Please use just one only* option.') + parser.print_help() + sys.exit(1) + + # These will be different due to different options enabled. + variables_total = None + variables_total_locstats = None + variables_with_loc = None + variables_scope_bytes_covered = None + variables_scope_bytes_from_first_def = None + variables_scope_bytes_entry_values = None + variables_coverage_map = {} + binary = results.file_name + + # Get the directory of the LLVM tools. + llvm_dwarfdump_cmd = os.path.join(os.path.dirname(__file__), \ + "llvm-dwarfdump") + # The statistics llvm-dwarfdump option. 
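
For reference, coverage_buckets() above yields twelve fixed labels, and the script turns each bucket count into a percentage of the processed variables. A small self-contained check with made-up counts for ten variables:

    from math import ceil

    def coverage_buckets():                  # same generator as in the script
        yield '0%'
        yield '1-9%'
        for start in range(10, 91, 10):
            yield '{0}-{1}%'.format(start, start + 9)
        yield '100%'

    print(list(coverage_buckets()))
    # ['0%', '1-9%', '10-19%', ..., '80-89%', '90-99%', '100%']  (12 buckets)

    counts = dict.fromkeys(coverage_buckets(), 0)
    counts.update({'90-99%': 3, '100%': 7})  # hypothetical sample of 10 variables
    for bucket in coverage_buckets():
        # Same rounding as the script: 0 for empty buckets, 30 for 90-99%, 70 for 100%.
        print(bucket, int(ceil(counts[bucket] * 100.0) / 10))
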
+ llvm_dwarfdump_stats_opt = "--statistics" + + subproc = Popen([llvm_dwarfdump_cmd, llvm_dwarfdump_stats_opt, binary], \ + stdin=PIPE, stdout=PIPE, stderr=PIPE, \ + universal_newlines = True) + cmd_stdout, cmd_stderr = subproc.communicate() + + # Get the JSON and parse it. + json_parsed = None + + try: + json_parsed = loads(cmd_stdout) + except: + print ('error: No valid llvm-dwarfdump statistics found.') + sys.exit(1) + + if results.only_variables: + # Read the JSON only for local variables. + variables_total_locstats = \ + json_parsed['total vars procesed by location statistics'] + variables_scope_bytes_covered = \ + json_parsed['vars scope bytes covered'] + variables_scope_bytes_from_first_def = \ + json_parsed['vars scope bytes total'] + if not results.ignore_debug_entry_values: + for cov_bucket in coverage_buckets(): + cov_category = "vars with {} of its scope covered".format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + else: + variables_scope_bytes_entry_values = \ + json_parsed['vars entry value scope bytes covered'] + variables_scope_bytes_covered = variables_scope_bytes_covered \ + - variables_scope_bytes_entry_values + for cov_bucket in coverage_buckets(): + cov_category = \ + "vars (excluding the debug entry values) " \ + "with {} of its scope covered".format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + elif results.only_formal_parameters: + # Read the JSON only for formal parameters. + variables_total_locstats = \ + json_parsed['total params procesed by location statistics'] + variables_scope_bytes_covered = \ + json_parsed['formal params scope bytes covered'] + variables_scope_bytes_from_first_def = \ + json_parsed['formal params scope bytes total'] + if not results.ignore_debug_entry_values: + for cov_bucket in coverage_buckets(): + cov_category = "params with {} of its scope covered".format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + else: + variables_scope_bytes_entry_values = \ + json_parsed['formal params entry value scope bytes covered'] + variables_scope_bytes_covered = variables_scope_bytes_covered \ + - variables_scope_bytes_entry_values + for cov_bucket in coverage_buckets(): + cov_category = \ + "params (excluding the debug entry values) " \ + "with {} of its scope covered".format(cov_bucket) + else: + # Read the JSON for both local variables and formal parameters. + variables_total = \ + json_parsed['source variables'] + variables_with_loc = json_parsed['variables with location'] + variables_total_locstats = \ + json_parsed['total variables procesed by location statistics'] + variables_scope_bytes_covered = \ + json_parsed['scope bytes covered'] + variables_scope_bytes_from_first_def = \ + json_parsed['scope bytes total'] + if not results.ignore_debug_entry_values: + for cov_bucket in coverage_buckets(): + cov_category = "variables with {} of its scope covered". \ + format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + else: + variables_scope_bytes_entry_values = \ + json_parsed['entry value scope bytes covered'] + variables_scope_bytes_covered = variables_scope_bytes_covered \ + - variables_scope_bytes_entry_values + for cov_bucket in coverage_buckets(): + cov_category = "variables (excluding the debug entry values) " \ + "with {} of its scope covered". format(cov_bucket) + variables_coverage_map[cov_bucket] = json_parsed[cov_category] + + # Pretty print collected info. 
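
At its core the wrapper runs llvm-dwarfdump --statistics on the given binary and reads fields out of the JSON it prints. A simplified sketch of that step, using subprocess.run (Python 3.7+) instead of the script's Popen; the binary path is a placeholder:

    import json
    import subprocess

    def variables_with_location(binary, dwarfdump='llvm-dwarfdump'):
        out = subprocess.run([dwarfdump, '--statistics', binary],
                             capture_output=True, universal_newlines=True)
        stats = json.loads(out.stdout)
        return stats.get('variables with location')

    # Example (needs a binary built with debug info):
    # print(variables_with_location('./a.out'))
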
+ locstats_output( + variables_total, + variables_total_locstats, + variables_with_loc, + variables_scope_bytes_covered, + variables_scope_bytes_from_first_def, + variables_coverage_map + ) + +if __name__ == '__main__': + Main() + sys.exit(0) diff --git a/llvm/utils/update_test_checks.py b/llvm/utils/update_test_checks.py index 5e0d4bac22f074..ec026022fc2f70 100755 --- a/llvm/utils/update_test_checks.py +++ b/llvm/utils/update_test_checks.py @@ -64,6 +64,8 @@ def main(): '--function', help='The function in the test file to update') parser.add_argument('-u', '--update-only', action='store_true', help='Only update test if it was already autogened') + parser.add_argument('-p', '--preserve-names', action='store_true', + help='Do not scrub IR names') parser.add_argument('tests', nargs='+') args = parser.parse_args() @@ -174,7 +176,8 @@ def main(): continue # Print out the various check lines here. - common.add_ir_checks(output_lines, ';', prefix_list, func_dict, func_name) + common.add_ir_checks(output_lines, ';', prefix_list, func_dict, + func_name, args.preserve_names) is_in_function_start = False if is_in_function: