From 8d00fc8a05c693d39945cfa6392fc744acbaeb7b Mon Sep 17 00:00:00 2001
From: Matthew Curtis
Date: Sat, 9 Nov 2024 08:18:46 -0600
Subject: [PATCH] Re-land x86_mmx IR type removal commits

cherry-pick:
dfeb3991fb48 jyknight@google.com Thu Jul 25 09:19:22 2024 -0400
  Remove the `x86_mmx` IR type. (#98505)
b7e4fba6e5dc jyknight@google.com Sun Jul 28 18:12:47 2024 -0400
  Cleanup x86_mmx after removing IR type (#100646)

Change-Id: I987eda387fc403ab249f9d48eeb13fd66606343a
---
 clang/test/OpenMP/allow-kernelc-io.c | 4 +-
 llvm/bindings/ocaml/llvm/llvm.mli | 4 -
 llvm/bindings/ocaml/llvm/llvm_ocaml.c | 5 -
 llvm/docs/BitCodeFormat.rst | 2 +-
 llvm/docs/LangRef.rst | 20 +-
 llvm/docs/ReleaseNotes.rst | 187 ++
 llvm/include/llvm-c/Core.h | 48 +-
 llvm/include/llvm/IR/DataLayout.h | 1 -
 llvm/include/llvm/IR/Type.h | 12 +-
 llvm/lib/Analysis/ConstantFolding.cpp | 8 +-
 llvm/lib/AsmParser/LLLexer.cpp | 1 -
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 4 +-
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 5 +-
 llvm/lib/CodeGen/ValueTypes.cpp | 7 +-
 llvm/lib/IR/AsmWriter.cpp | 5 +-
 llvm/lib/IR/ConstantFold.cpp | 2 +-
 llvm/lib/IR/Core.cpp | 8 -
 llvm/lib/IR/DataLayout.cpp | 1 -
 llvm/lib/IR/Instructions.cpp | 9 -
 llvm/lib/IR/Intrinsics.cpp | 11 +-
 llvm/lib/IR/LLVMContextImpl.cpp | 6 +-
 llvm/lib/IR/LLVMContextImpl.h | 2 +-
 llvm/lib/IR/Type.cpp | 15 +-
 .../DirectX/DXILWriter/DXILBitcodeWriter.cpp | 3 -
 .../Hexagon/HexagonTargetObjectFile.cpp | 1 -
 llvm/lib/Target/X86/X86CallingConv.td | 34 -
 llvm/lib/Target/X86/X86ISelLowering.cpp | 92 +-
 .../Target/X86/X86InstCombineIntrinsic.cpp | 8 +-
 llvm/lib/Target/X86/X86IntrinsicsInfo.h | 96 +-
 .../IPO/DeadArgumentElimination.cpp | 6 +-
 .../InstCombine/InstCombineCasts.cpp | 7 -
 .../Instrumentation/MemorySanitizer.cpp | 50 +-
 llvm/test/Assembler/x86mmx.ll | 9 -
 llvm/test/Bindings/llvm-c/echo.ll | 2 +-
 llvm/test/Bitcode/bcanalyzer-types.ll | 6 -
 llvm/test/Bitcode/compatibility-3.6.ll | 2 +-
 llvm/test/Bitcode/compatibility-3.7.ll | 2 +-
 llvm/test/Bitcode/compatibility-3.8.ll | 2 +-
 llvm/test/Bitcode/compatibility-3.9.ll | 2 +-
 llvm/test/Bitcode/compatibility-4.0.ll | 2 +-
 llvm/test/Bitcode/compatibility-5.0.ll | 2 +-
 llvm/test/Bitcode/compatibility-6.0.ll | 2 +-
 llvm/test/Bitcode/compatibility.ll | 2 -
 llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll | 6 +-
 .../test/CodeGen/X86/2007-07-03-GR64ToVR64.ll | 10 +-
 .../CodeGen/X86/2008-04-08-CoalescerCrash.ll | 8 +-
 .../CodeGen/X86/2008-08-23-64Bit-maskmovq.ll | 8 +-
 .../CodeGen/X86/2008-09-05-sinttofp-2xi32.ll | 35 +-
 .../CodeGen/X86/2011-06-14-mmx-inlineasm.ll | 8 +-
 llvm/test/CodeGen/X86/avx-vbroadcast.ll | 12 +-
 llvm/test/CodeGen/X86/avx2-vbroadcast.ll | 15 +-
 llvm/test/CodeGen/X86/bitcast-mmx.ll | 34 +-
 .../CodeGen/X86/expand-vr64-gr64-copy.mir | 6 +-
 llvm/test/CodeGen/X86/fake-use-vector.ll | 1 -
 llvm/test/CodeGen/X86/fast-isel-bc.ll | 15 +-
 .../test/CodeGen/X86/fast-isel-nontemporal.ll | 8 +-
 .../CodeGen/X86/mmx-arg-passing-x86-64.ll | 29 +-
 llvm/test/CodeGen/X86/mmx-arg-passing.ll | 21 +-
 llvm/test/CodeGen/X86/mmx-arith.ll | 380 +--
 llvm/test/CodeGen/X86/mmx-bitcast-fold.ll | 8 +-
 llvm/test/CodeGen/X86/mmx-bitcast.ll | 54 +-
 llvm/test/CodeGen/X86/mmx-build-vector.ll | 134 +-
 llvm/test/CodeGen/X86/mmx-coalescing.ll | 28 +-
 llvm/test/CodeGen/X86/mmx-cvt.ll | 160 +-
 llvm/test/CodeGen/X86/mmx-fold-load.ll | 275 +-
 llvm/test/CodeGen/X86/mmx-fold-zero.ll | 52 +-
 llvm/test/CodeGen/X86/mmx-intrinsics.ll | 1263 ++++----
 llvm/test/CodeGen/X86/mmx-only.ll | 10 +-
 llvm/test/CodeGen/X86/mxcsr-reg-usage.ll | 24 +-
 llvm/test/CodeGen/X86/nontemporal.ll | 8
+- llvm/test/CodeGen/X86/pr13859.ll | 5 +- llvm/test/CodeGen/X86/pr23246.ll | 7 +- llvm/test/CodeGen/X86/pr29222.ll | 10 +- llvm/test/CodeGen/X86/pr35982.ll | 8 +- llvm/test/CodeGen/X86/select-mmx.ll | 53 +- llvm/test/CodeGen/X86/stack-folding-mmx.ll | 1288 +++++---- llvm/test/CodeGen/X86/vec_extract-mmx.ll | 43 +- llvm/test/CodeGen/X86/vec_insert-5.ll | 4 +- llvm/test/CodeGen/X86/vec_insert-7.ll | 11 +- llvm/test/CodeGen/X86/vec_insert-mmx.ll | 14 +- llvm/test/CodeGen/X86/vector-shuffle-mmx.ll | 30 +- llvm/test/CodeGen/X86/x86-64-psub.ll | 70 +- .../MemorySanitizer/X86/mmx-intrinsics.ll | 2543 +++++++++-------- .../MemorySanitizer/i386/mmx-intrinsics.ll | 1 - .../MemorySanitizer/vector_arith.ll | 28 +- .../MemorySanitizer/vector_cvt.ll | 16 +- .../MemorySanitizer/vector_pack.ll | 23 +- .../MemorySanitizer/vector_shift.ll | 20 +- .../X86/x86-GCC-inline-asm-Y-constraints.ll | 2 +- .../Transforms/InstCombine/X86/x86-movmsk.ll | 30 +- .../bitcast-vec-canon-inseltpoison.ll | 45 - .../InstCombine/bitcast-vec-canon.ll | 44 - llvm/test/Transforms/InstCombine/cast.ll | 21 - .../ConstProp/gep-zeroinit-vector.ll | 15 +- .../InstSimplify/ConstProp/loads.ll | 13 - llvm/test/Transforms/LoopUnroll/X86/mmx.ll | 35 - llvm/test/Transforms/SCCP/crash.ll | 6 +- llvm/test/Transforms/SROA/pr57796.ll | 14 +- llvm/test/Verifier/atomics.ll | 10 +- llvm/tools/llvm-c-test/echo.cpp | 2 - llvm/tools/llvm-stress/llvm-stress.cpp | 8 +- llvm/unittests/IR/InstructionsTest.cpp | 9 +- mlir/docs/Dialects/LLVM.md | 2 - mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h | 1 - mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 1 - mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp | 2 - mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp | 2 - mlir/lib/Target/LLVMIR/TypeToLLVM.cpp | 3 - offload/hostexec/services/execute_service.cpp | 3 - revert_patches.txt | 5 - 110 files changed, 3936 insertions(+), 3820 deletions(-) create mode 100644 llvm/docs/ReleaseNotes.rst delete mode 100644 llvm/test/Assembler/x86mmx.ll delete mode 100644 llvm/test/Transforms/LoopUnroll/X86/mmx.ll diff --git a/clang/test/OpenMP/allow-kernelc-io.c b/clang/test/OpenMP/allow-kernelc-io.c index fcdf6c2f575e02..934fb294a2cd73 100644 --- a/clang/test/OpenMP/allow-kernelc-io.c +++ b/clang/test/OpenMP/allow-kernelc-io.c @@ -48,7 +48,7 @@ int main(void) { // CHECK-NOPE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 1 // CHECK-NOPE-NEXT: store i32 1, ptr addrspace(1) [[TMP3]], align 4 // CHECK-NOPE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 2 -// CHECK-NOPE-NEXT: store i32 983041, ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NOPE-NEXT: store i32 917505, ptr addrspace(1) [[TMP4]], align 4 // CHECK-NOPE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 3 // CHECK-NOPE-NEXT: store i32 11, ptr addrspace(1) [[TMP5]], align 4 // CHECK-NOPE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i64 16 @@ -78,7 +78,7 @@ int main(void) { // CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 1 // CHECK-NEXT: store i32 1, ptr addrspace(1) [[TMP3]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 2 -// CHECK-NEXT: store i32 983041, ptr addrspace(1) 
[[TMP4]], align 4 +// CHECK-NEXT: store i32 917505, ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 3 // CHECK-NEXT: store i32 11, ptr addrspace(1) [[TMP5]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i64 16 diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli index 25ce72e846d001..17cad1f43888be 100644 --- a/llvm/bindings/ocaml/llvm/llvm.mli +++ b/llvm/bindings/ocaml/llvm/llvm.mli @@ -766,10 +766,6 @@ val void_type : llcontext -> lltype [llvm::Type::LabelTy]. *) val label_type : llcontext -> lltype -(** [x86_mmx_type c] returns the x86 64-bit MMX register type in the - context [c]. See [llvm::Type::X86_MMXTy]. *) -val x86_mmx_type : llcontext -> lltype - (** [type_by_name m name] returns the specified type from the current module if it exists. See the method [llvm::Module::getTypeByName] *) diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c index 4ac824cd6a98a6..5906f427e69072 100644 --- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c +++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c @@ -686,11 +686,6 @@ value llvm_label_type(value Context) { return to_val(LLVMLabelTypeInContext(Context_val(Context))); } -/* llcontext -> lltype */ -value llvm_x86_mmx_type(value Context) { - return to_val(LLVMX86MMXTypeInContext(Context_val(Context))); -} - /* llmodule -> string -> lltype option */ value llvm_type_by_name(value M, value Name) { return ptr_to_option(LLVMGetTypeByName(Module_val(M), String_val(Name))); diff --git a/llvm/docs/BitCodeFormat.rst b/llvm/docs/BitCodeFormat.rst index 89933e3fc00502..8a26b101c4bf8e 100644 --- a/llvm/docs/BitCodeFormat.rst +++ b/llvm/docs/BitCodeFormat.rst @@ -1227,7 +1227,7 @@ TYPE_CODE_X86_MMX Record ``[X86_MMX]`` -The ``X86_MMX`` record (code 17) adds an ``x86_mmx`` type to the type table. +The ``X86_MMX`` record (code 17) is deprecated, and imported as a <1 x i64> vector. TYPE_CODE_STRUCT_ANON Record ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index df1002c51a4e7b..8bcb22bf72cc47 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4060,24 +4060,6 @@ or constants of this type. x86_amx -X86_mmx Type -"""""""""""" - -:Overview: - -The x86_mmx type represents a value held in an MMX register on an x86 -machine. The operations allowed on it are quite limited: parameters and -return values, load and store, and bitcast. User-specified MMX -instructions are represented as intrinsic or asm calls with arguments -and/or results of this type. There are no arrays, vectors or constants -of this type. - -:Syntax: - -:: - - x86_mmx - .. _t_pointer: @@ -4511,7 +4493,7 @@ represented by ``0xH`` followed by 4 hexadecimal digits. The bfloat 16-bit format is represented by ``0xR`` followed by 4 hexadecimal digits. All hexadecimal formats are big-endian (sign bit at the left). -There are no constants of type x86_mmx and x86_amx. +There are no constants of type x86_amx. .. _complexconstants: diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst new file mode 100644 index 00000000000000..551a9bec3b9161 --- /dev/null +++ b/llvm/docs/ReleaseNotes.rst @@ -0,0 +1,187 @@ +============================ +LLVM |release| Release Notes +============================ + +.. contents:: + :local: + +.. only:: PreRelease + + .. 
warning:: + These are in-progress notes for the upcoming LLVM |version| release. + Release notes for previous releases can be found on + `the Download Page `_. + + +Introduction +============ + +This document contains the release notes for the LLVM Compiler Infrastructure, +release |release|. Here we describe the status of LLVM, including major improvements +from the previous release, improvements in various subprojects of LLVM, and +some of the current users of the code. All LLVM releases may be downloaded +from the `LLVM releases web site `_. + +For more information about LLVM, including information about the latest +release, please check out the `main LLVM web site `_. If you +have questions or comments, the `Discourse forums +`_ is a good place to ask +them. + +Note that if you are reading this file from a Git checkout or the main +LLVM web page, this document applies to the *next* release, not the current +one. To see the release notes for a specific release, please see the `releases +page `_. + +Non-comprehensive list of changes in this release +================================================= +.. NOTE + For small 1-3 sentence descriptions, just add an entry at the end of + this list. If your description won't fit comfortably in one bullet + point (e.g. maybe you would like to give an example of the + functionality, or simply have a lot to talk about), see the `NOTE` below + for adding a new subsection. + +* ... + +Update on required toolchains to build LLVM +------------------------------------------- + +Changes to the LLVM IR +---------------------- + +* The ``x86_mmx`` IR type has been removed. It will be translated to + the standard vector type ``<1 x i64>`` in bitcode upgrade. + +Changes to LLVM infrastructure +------------------------------ + +Changes to building LLVM +------------------------ + +Changes to TableGen +------------------- + +Changes to Interprocedural Optimizations +---------------------------------------- + +Changes to the AArch64 Backend +------------------------------ + +* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill + the required alignment space with a sequence of `0x0` bytes (the requested + fill value) rather than NOPs. + +Changes to the AMDGPU Backend +----------------------------- + +Changes to the ARM Backend +-------------------------- + +* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill + the required alignment space with a sequence of `0x0` bytes (the requested + fill value) rather than NOPs. + +Changes to the AVR Backend +-------------------------- + +Changes to the DirectX Backend +------------------------------ + +Changes to the Hexagon Backend +------------------------------ + +Changes to the LoongArch Backend +-------------------------------- + +Changes to the MIPS Backend +--------------------------- + +Changes to the PowerPC Backend +------------------------------ + +Changes to the RISC-V Backend +----------------------------- + +* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill + the required alignment space with a sequence of `0x0` bytes (the requested + fill value) rather than NOPs. 
+ +Changes to the WebAssembly Backend +---------------------------------- + +Changes to the Windows Target +----------------------------- + +Changes to the X86 Backend +-------------------------- + +* `.balign N, 0x90`, `.p2align N, 0x90`, and `.align N, 0x90` in code sections + now fill the required alignment space with repeating `0x90` bytes, rather than + using optimised NOP filling. Optimised NOP filling fills the space with NOP + instructions of various widths, not just those that use the `0x90` byte + encoding. To use optimised NOP filling in a code section, leave off the + "fillval" argument, i.e. `.balign N`, `.p2align N` or `.align N` respectively. + +* Due to the removal of the ``x86_mmx`` IR type, functions with + ``x86_mmx`` arguments or return values will use a different, + incompatible, calling convention ABI. Such functions are not + generally seen in the wild (Clang never generates them!), so this is + not expected to result in real-world compatibility problems. + +Changes to the OCaml bindings +----------------------------- + +Changes to the Python bindings +------------------------------ + +Changes to the C API +-------------------- + +* The following symbols are deleted due to the removal of the ``x86_mmx`` IR type: + + * ``LLVMX86_MMXTypeKind`` + * ``LLVMX86MMXTypeInContext`` + * ``LLVMX86MMXType`` + +Changes to the CodeGen infrastructure +------------------------------------- + +Changes to the Metadata Info +--------------------------------- + +Changes to the Debug Info +--------------------------------- + +Changes to the LLVM tools +--------------------------------- + +Changes to LLDB +--------------------------------- + +Changes to BOLT +--------------------------------- + +Changes to Sanitizers +--------------------- + +Other Changes +------------- + +External Open Source Projects Using LLVM 19 +=========================================== + +* A project... + +Additional Information +====================== + +A wide variety of additional information is available on the `LLVM web page +`_, in particular in the `documentation +`_ section. The web page also contains versions of the +API documentation which is up-to-date with the Git version of the source +code. You can access versions of these documents specific to this release by +going into the ``llvm/docs/`` directory in the LLVM tree. + +If you have any questions or comments about LLVM, please feel free to contact +us via the `Discourse forums `_. 
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 978ecb0b686676..dc8ecf4fb2ade2 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -146,27 +146,27 @@ typedef enum { } LLVMOpcode; typedef enum { - LLVMVoidTypeKind, /**< type with no size */ - LLVMHalfTypeKind, /**< 16 bit floating point type */ - LLVMFloatTypeKind, /**< 32 bit floating point type */ - LLVMDoubleTypeKind, /**< 64 bit floating point type */ - LLVMX86_FP80TypeKind, /**< 80 bit floating point type (X87) */ - LLVMFP128TypeKind, /**< 128 bit floating point type (112-bit mantissa)*/ - LLVMPPC_FP128TypeKind, /**< 128 bit floating point type (two 64-bits) */ - LLVMLabelTypeKind, /**< Labels */ - LLVMIntegerTypeKind, /**< Arbitrary bit width integers */ - LLVMFunctionTypeKind, /**< Functions */ - LLVMStructTypeKind, /**< Structures */ - LLVMArrayTypeKind, /**< Arrays */ - LLVMPointerTypeKind, /**< Pointers */ - LLVMVectorTypeKind, /**< Fixed width SIMD vector type */ - LLVMMetadataTypeKind, /**< Metadata */ - LLVMX86_MMXTypeKind, /**< X86 MMX */ - LLVMTokenTypeKind, /**< Tokens */ - LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */ - LLVMBFloatTypeKind, /**< 16 bit brain floating point type */ - LLVMX86_AMXTypeKind, /**< X86 AMX */ - LLVMTargetExtTypeKind, /**< Target extension type */ + LLVMVoidTypeKind = 0, /**< type with no size */ + LLVMHalfTypeKind = 1, /**< 16 bit floating point type */ + LLVMFloatTypeKind = 2, /**< 32 bit floating point type */ + LLVMDoubleTypeKind = 3, /**< 64 bit floating point type */ + LLVMX86_FP80TypeKind = 4, /**< 80 bit floating point type (X87) */ + LLVMFP128TypeKind = 5, /**< 128 bit floating point type (112-bit mantissa)*/ + LLVMPPC_FP128TypeKind = 6, /**< 128 bit floating point type (two 64-bits) */ + LLVMLabelTypeKind = 7, /**< Labels */ + LLVMIntegerTypeKind = 8, /**< Arbitrary bit width integers */ + LLVMFunctionTypeKind = 9, /**< Functions */ + LLVMStructTypeKind = 10, /**< Structures */ + LLVMArrayTypeKind = 11, /**< Arrays */ + LLVMPointerTypeKind = 12, /**< Pointers */ + LLVMVectorTypeKind = 13, /**< Fixed width SIMD vector type */ + LLVMMetadataTypeKind = 14, /**< Metadata */ + /* 15 previously used by LLVMX86_MMXTypeKind */ + LLVMTokenTypeKind = 16, /**< Tokens */ + LLVMScalableVectorTypeKind = 17, /**< Scalable SIMD vector type */ + LLVMBFloatTypeKind = 18, /**< 16 bit brain floating point type */ + LLVMX86_AMXTypeKind = 19, /**< X86 AMX */ + LLVMTargetExtTypeKind = 20, /**< Target extension type */ } LLVMTypeKind; typedef enum { @@ -1734,11 +1734,6 @@ LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C); */ LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C); -/** - * Create a X86 MMX type in a context. - */ -LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C); - /** * Create a X86 AMX type in a context. 
*/ @@ -1760,7 +1755,6 @@ LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C); */ LLVMTypeRef LLVMVoidType(void); LLVMTypeRef LLVMLabelType(void); -LLVMTypeRef LLVMX86MMXType(void); LLVMTypeRef LLVMX86AMXType(void); /** diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 829249a6fbe38b..93bd519f5727d8 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -646,7 +646,6 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const { case Type::FloatTyID: return TypeSize::getFixed(32); case Type::DoubleTyID: - case Type::X86_MMXTyID: return TypeSize::getFixed(64); case Type::PPC_FP128TyID: case Type::FP128TyID: diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index f03684372d3877..7fa940ab347af6 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -63,7 +63,6 @@ class Type { VoidTyID, ///< type with no size LabelTyID, ///< Labels MetadataTyID, ///< Metadata - X86_MMXTyID, ///< MMX vectors (64 bits, X86 specific) X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific) TokenTyID, ///< Tokens @@ -197,9 +196,6 @@ class Type { const fltSemantics &getFltSemantics() const; - /// Return true if this is X86 MMX. - bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; } - /// Return true if this is X86 AMX. bool isX86_AMXTy() const { return getTypeID() == X86_AMXTyID; } @@ -285,8 +281,8 @@ class Type { /// Return true if the type is a valid type for a register in codegen. This /// includes all first-class types except struct and array types. bool isSingleValueType() const { - return isFloatingPointTy() || isX86_MMXTy() || isIntegerTy() || - isPointerTy() || isVectorTy() || isX86_AMXTy() || isTargetExtTy(); + return isFloatingPointTy() || isIntegerTy() || isPointerTy() || + isVectorTy() || isX86_AMXTy() || isTargetExtTy(); } /// Return true if the type is an aggregate type. This means it is valid as @@ -302,8 +298,7 @@ class Type { bool isSized(SmallPtrSetImpl *Visited = nullptr) const { // If it's a primitive, it is always sized. if (getTypeID() == IntegerTyID || isFloatingPointTy() || - getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID || - getTypeID() == X86_AMXTyID) + getTypeID() == PointerTyID || getTypeID() == X86_AMXTyID) return true; // If it is not something that can have a size (e.g. a function or label), // it doesn't have a size. @@ -445,7 +440,6 @@ class Type { static Type *getX86_FP80Ty(LLVMContext &C); static Type *getFP128Ty(LLVMContext &C); static Type *getPPC_FP128Ty(LLVMContext &C); - static Type *getX86_MMXTy(LLVMContext &C); static Type *getX86_AMXTy(LLVMContext &C); static Type *getTokenTy(LLVMContext &C); static IntegerType *getIntNTy(LLVMContext &C, unsigned N); diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 5f6df3fa06592e..88db315ffd0bcb 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -565,16 +565,14 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy, Type *MapTy = Type::getIntNTy(C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedValue()); if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) { - if (Res->isNullValue() && !LoadTy->isX86_MMXTy() && - !LoadTy->isX86_AMXTy()) + if (Res->isNullValue() && !LoadTy->isX86_AMXTy()) // Materializing a zero can be done trivially without a bitcast return Constant::getNullValue(LoadTy); Type *CastTy = LoadTy->isPtrOrPtrVectorTy() ? 
DL.getIntPtrType(LoadTy) : LoadTy; Res = FoldBitCast(Res, CastTy, DL); if (LoadTy->isPtrOrPtrVectorTy()) { // For vector of pointer, we needed to first convert to a vector of integer, then do vector inttoptr - if (Res->isNullValue() && !LoadTy->isX86_MMXTy() && - !LoadTy->isX86_AMXTy()) + if (Res->isNullValue() && !LoadTy->isX86_AMXTy()) return Constant::getNullValue(LoadTy); if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) // Be careful not to replace a load of an addrspace value with an inttoptr here @@ -765,7 +763,7 @@ Constant *llvm::ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty, // uniform. if (!DL.typeSizeEqualsStoreSize(C->getType())) return nullptr; - if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) + if (C->isNullValue() && !Ty->isX86_AMXTy()) return Constant::getNullValue(Ty); if (C->isAllOnesValue() && (Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy())) diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 65343c9630384d..ae3c89ab1acc36 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -881,7 +881,6 @@ lltok::Kind LLLexer::LexIdentifier() { TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context)); TYPEKEYWORD("label", Type::getLabelTy(Context)); TYPEKEYWORD("metadata", Type::getMetadataTy(Context)); - TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); TYPEKEYWORD("x86_amx", Type::getX86_AMXTy(Context)); TYPEKEYWORD("token", Type::getTokenTy(Context)); TYPEKEYWORD("ptr", PointerType::getUnqual(Context)); diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index f98d4751d3f2ea..78fb92dc44494f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2528,7 +2528,9 @@ Error BitcodeReader::parseTypeTableBody() { ResultTy = Type::getMetadataTy(Context); break; case bitc::TYPE_CODE_X86_MMX: // X86_MMX - ResultTy = Type::getX86_MMXTy(Context); + // Deprecated: decodes as <1 x i64> + ResultTy = + llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); break; case bitc::TYPE_CODE_X86_AMX: // X86_AMX ResultTy = Type::getX86_AMXTy(Context); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 9bda3d282a5bb7..1f62bc8ecfa106 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1117,8 +1117,9 @@ void ModuleBitcodeWriter::writeTypeTable() { case Type::FP128TyID: Code = bitc::TYPE_CODE_FP128; break; case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break; case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break; - case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; - case Type::X86_MMXTyID: Code = bitc::TYPE_CODE_X86_MMX; break; + case Type::MetadataTyID: + Code = bitc::TYPE_CODE_METADATA; + break; case Type::X86_AMXTyID: Code = bitc::TYPE_CODE_X86_AMX; break; case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break; case Type::IntegerTyID: diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index b10c58babf93fc..e3c746b274dde1 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -214,7 +214,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { assert(isExtended() && "Type is not extended!"); return LLVMTy; case MVT::isVoid: return Type::getVoidTy(Context); - case MVT::x86mmx: return Type::getX86_MMXTy(Context); + case MVT::x86mmx: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 
64), 1); case MVT::aarch64svcount: return TargetExtType::get(Context, "aarch64.svcount"); case MVT::x86amx: return Type::getX86_AMXTy(Context); @@ -248,8 +248,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){ case Type::BFloatTyID: return MVT(MVT::bf16); case Type::FloatTyID: return MVT(MVT::f32); case Type::DoubleTyID: return MVT(MVT::f64); - case Type::X86_FP80TyID: return MVT(MVT::f80); - case Type::X86_MMXTyID: return MVT(MVT::x86mmx); + case Type::X86_FP80TyID: + return MVT(MVT::f80); case Type::TargetExtTyID: { TargetExtType *TargetExtTy = cast(Ty); if (TargetExtTy->getName() == "aarch64.svcount") @@ -334,4 +334,3 @@ void MVT::print(raw_ostream &OS) const { else OS << EVT(*this).getEVTString(); } - diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ad65e33119b85f..da4b9ec87a6f46 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -572,8 +572,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) { case Type::FP128TyID: OS << "fp128"; return; case Type::PPC_FP128TyID: OS << "ppc_fp128"; return; case Type::LabelTyID: OS << "label"; return; - case Type::MetadataTyID: OS << "metadata"; return; - case Type::X86_MMXTyID: OS << "x86_mmx"; return; + case Type::MetadataTyID: + OS << "metadata"; + return; case Type::X86_AMXTyID: OS << "x86_amx"; return; case Type::TokenTyID: OS << "token"; return; case Type::IntegerTyID: diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index d3566c7ddf66b9..cfe87937c372cd 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -142,7 +142,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, return UndefValue::get(DestTy); } - if (V->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy() && + if (V->isNullValue() && !DestTy->isX86_AMXTy() && opc != Instruction::AddrSpaceCast) return Constant::getNullValue(DestTy); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index e2b91f3781f03b..b497d7cc829d3a 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -617,8 +617,6 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) { return LLVMPointerTypeKind; case Type::FixedVectorTyID: return LLVMVectorTypeKind; - case Type::X86_MMXTyID: - return LLVMX86_MMXTypeKind; case Type::X86_AMXTyID: return LLVMX86_AMXTypeKind; case Type::TokenTyID: @@ -733,9 +731,6 @@ LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) { LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C)); } -LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) { - return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C)); -} LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getX86_AMXTy(*unwrap(C)); } @@ -761,9 +756,6 @@ LLVMTypeRef LLVMFP128Type(void) { LLVMTypeRef LLVMPPCFP128Type(void) { return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext()); } -LLVMTypeRef LLVMX86MMXType(void) { - return LLVMX86MMXTypeInContext(LLVMGetGlobalContext()); -} LLVMTypeRef LLVMX86AMXType(void) { return LLVMX86AMXTypeInContext(LLVMGetGlobalContext()); } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 7c405a18abbc56..a4af0ead07cf61 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -823,7 +823,6 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { // layout. 
return Align(PowerOf2Ceil(BitWidth / 8)); } - case Type::X86_MMXTyID: case Type::FixedVectorTyID: case Type::ScalableVectorTyID: { unsigned BitWidth = getTypeSizeInBits(Ty).getKnownMinValue(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 19f51b06ab184a..05e340ffa20a07 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3107,9 +3107,6 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) { if (SrcBits != DestBits) return false; - if (DestTy->isX86_MMXTy() || SrcTy->isX86_MMXTy()) - return false; - return true; } @@ -3219,12 +3216,6 @@ CastInst::getCastOpcode( return IntToPtr; // int -> ptr } llvm_unreachable("Casting pointer to other than pointer or int"); - } else if (DestTy->isX86_MMXTy()) { - if (SrcTy->isVectorTy()) { - assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX"); - return BitCast; // 64-bit vector to MMX - } - llvm_unreachable("Illegal cast to X86_MMX"); } llvm_unreachable("Casting to type that is not first-class"); } diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 46df6845ff3ab7..3130a0bd2955a5 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -140,9 +140,6 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) { case Type::PPC_FP128TyID: Result += "ppcf128"; break; - case Type::X86_MMXTyID: - Result += "x86mmx"; - break; case Type::X86_AMXTyID: Result += "x86amx"; break; @@ -497,7 +494,7 @@ static Type *DecodeFixedType(ArrayRef &Infos, case IITDescriptor::VarArg: return Type::getVoidTy(Context); case IITDescriptor::MMX: - return Type::getX86_MMXTy(Context); + return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); case IITDescriptor::AMX: return Type::getX86_AMXTy(Context); case IITDescriptor::Token: @@ -797,7 +794,11 @@ matchIntrinsicType(Type *Ty, ArrayRef &Infos, return !Ty->isVoidTy(); case IITDescriptor::VarArg: return true; - case IITDescriptor::MMX: return !Ty->isX86_MMXTy(); + case IITDescriptor::MMX: { + FixedVectorType *VT = dyn_cast(Ty); + return !VT || VT->getNumElements() != 1 || + !VT->getElementType()->isIntegerTy(64); + } case IITDescriptor::AMX: return !Ty->isX86_AMXTy(); case IITDescriptor::Token: diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 01024d27444909..f42c15c8f749a5 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -37,9 +37,9 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C) FloatTy(C, Type::FloatTyID), DoubleTy(C, Type::DoubleTyID), MetadataTy(C, Type::MetadataTyID), TokenTy(C, Type::TokenTyID), X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID), - PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID), - X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8), - Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) {} + PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_AMXTy(C, Type::X86_AMXTyID), + Int1Ty(C, 1), Int8Ty(C, 8), Int16Ty(C, 16), Int32Ty(C, 32), + Int64Ty(C, 64), Int128Ty(C, 128) {} LLVMContextImpl::~LLVMContextImpl() { #ifndef NDEBUG diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 0936611cab1d83..4ce42904a69d46 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1617,7 +1617,7 @@ class LLVMContextImpl { // Basic type instances. 
Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy, TokenTy; - Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy, X86_AMXTy; + Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_AMXTy; IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty; std::unique_ptr TheNoneToken; diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index cb91b973aabc73..151f36c91e0d93 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -47,8 +47,8 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { case FP128TyID : return getFP128Ty(C); case PPC_FP128TyID : return getPPC_FP128Ty(C); case LabelTyID : return getLabelTy(C); - case MetadataTyID : return getMetadataTy(C); - case X86_MMXTyID : return getX86_MMXTy(C); + case MetadataTyID: + return getMetadataTy(C); case X86_AMXTyID : return getX86_AMXTy(C); case TokenTyID : return getTokenTy(C); default: @@ -138,14 +138,6 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { if (isa(this) && isa(Ty)) return getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits(); - // 64-bit fixed width vector types can be losslessly converted to x86mmx. - if (((isa(this)) && Ty->isX86_MMXTy()) && - getPrimitiveSizeInBits().getFixedValue() == 64) - return true; - if ((isX86_MMXTy() && isa(Ty)) && - Ty->getPrimitiveSizeInBits().getFixedValue() == 64) - return true; - // 8192-bit fixed width vector types can be losslessly converted to x86amx. if (((isa(this)) && Ty->isX86_AMXTy()) && getPrimitiveSizeInBits().getFixedValue() == 8192) @@ -192,8 +184,6 @@ TypeSize Type::getPrimitiveSizeInBits() const { return TypeSize::getFixed(128); case Type::PPC_FP128TyID: return TypeSize::getFixed(128); - case Type::X86_MMXTyID: - return TypeSize::getFixed(64); case Type::X86_AMXTyID: return TypeSize::getFixed(8192); case Type::IntegerTyID: @@ -258,7 +248,6 @@ Type *Type::getTokenTy(LLVMContext &C) { return &C.pImpl->TokenTy; } Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; } Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; } Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; } -Type *Type::getX86_MMXTy(LLVMContext &C) { return &C.pImpl->X86_MMXTy; } Type *Type::getX86_AMXTy(LLVMContext &C) { return &C.pImpl->X86_AMXTy; } IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; } diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 0be978e41350c1..45aadac861946b 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1048,9 +1048,6 @@ void DXILBitcodeWriter::writeTypeTable() { case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; - case Type::X86_MMXTyID: - Code = bitc::TYPE_CODE_X86_MMX; - break; case Type::IntegerTyID: // INTEGER: [width] Code = bitc::TYPE_CODE_INTEGER; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index e5d10a75728bf8..0c1b0aea41f41f 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -329,7 +329,6 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty, case Type::PPC_FP128TyID: case Type::LabelTyID: case Type::MetadataTyID: - case Type::X86_MMXTyID: case Type::X86_AMXTyID: case Type::TokenTyID: case Type::TypedPointerTyID: diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td 
index ddbc267c151157..91af111db8cda5 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -168,10 +168,6 @@ def CC_#NAME : CallingConv<[ CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64], CCAssignToStack<8, 4>>, - // MMX type gets 8 byte slot in stack , while alignment depends on target - CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>, - CCIfType<[x86mmx], CCAssignToStack<8, 4>>, - // float 128 get stack slots whose size and alignment depends // on the subtarget. CCIfType<[f80, f128], CCAssignToStack<0, 0>>, @@ -286,10 +282,6 @@ def RetCC_X86Common : CallingConv<[ CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, - // MMX vector types are always returned in MM0. If the target doesn't have - // MM0, it doesn't support these vector types. - CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, - // Long double types are always returned in FP0 (even with SSE), // except on Win64. CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>> @@ -376,9 +368,6 @@ def RetCC_X86_64_C : CallingConv<[ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, - // MMX vector types are always returned in XMM0. - CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, - // Pointers are always returned in full 64-bit registers. CCIfPtr>, @@ -389,9 +378,6 @@ def RetCC_X86_64_C : CallingConv<[ // X86-Win64 C return-value convention. def RetCC_X86_Win64_C : CallingConv<[ - // The X86-Win64 calling convention always returns __m64 values in RAX. - CCIfType<[x86mmx], CCBitConvertToType>, - // GCC returns FP values in RAX on Win64. CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, @@ -436,8 +422,6 @@ def RetCC_X86_64_Swift : CallingConv<[ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, - // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3. - CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, CCDelegateTo ]>; @@ -572,12 +556,6 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, - // The first 8 MMX vector arguments are passed in XMM registers on Darwin. - CCIfType<[x86mmx], - CCIfSubtarget<"isTargetDarwin()", - CCIfSubtarget<"hasSSE2()", - CCPromoteToType>>>, - // Boolean vectors of AVX-512 are passed in SIMD registers. // The call from AVX to AVX-512 function should work, // since the boolean types in AVX/AVX2 are promoted by default. @@ -666,9 +644,6 @@ def CC_X86_Win64_C : CallingConv<[ // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, - // The first 4 MMX vector arguments are passed in GPRs. - CCIfType<[x86mmx], CCBitConvertToType>, - // If SSE was disabled, pass FP values smaller than 64-bits as integers in // GPRs or on the stack. CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, @@ -843,11 +818,6 @@ def CC_X86_32_Common : CallingConv<[ CCIfNotVarArg>>>, - // The first 3 __m64 vector arguments are passed in mmx registers if the - // call is not a vararg call. - CCIfNotVarArg>>, - CCIfType<[f16], CCAssignToStack<4, 4>>, // Integer/Float values get stored in stack slots that are 4 bytes in @@ -870,10 +840,6 @@ def CC_X86_32_Common : CallingConv<[ CCIfType<[v32i1], CCPromoteToType>, CCIfType<[v64i1], CCPromoteToType>, - // __m64 vectors get 8-byte stack slots that are 4-byte aligned. 
They are - // passed in the parameter area. - CCIfType<[x86mmx], CCAssignToStack<8, 4>>, - // Darwin passes vectors in a form that differs from the i386 psABI CCIfSubtarget<"isTargetDarwin()", CCDelegateTo>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 713f4fb7d4cc14..91e48f1e77db12 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2649,7 +2649,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND, ISD::FP_ROUND, - ISD::STRICT_FP_ROUND}); + ISD::STRICT_FP_ROUND, + ISD::INTRINSIC_VOID, + ISD::INTRINSIC_WO_CHAIN, + ISD::INTRINSIC_W_CHAIN}); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -27652,6 +27655,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, llvm_unreachable("Unsupported truncstore intrinsic"); } } + case INTR_TYPE_CAST_MMX: + return SDValue(); // handled in combineINTRINSIC_* } } @@ -58616,6 +58621,86 @@ static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>, +// and so SelectionDAGBuilder creates them with v1i64 types, but they need to +// use x86mmx instead. +static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) { + SDLoc dl(N); + + bool MadeChange = false, CastReturnVal = false; + SmallVector Args; + for (const SDValue &Arg : N->op_values()) { + if (Arg.getValueType() == MVT::v1i64) { + MadeChange = true; + Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg)); + } else + Args.push_back(Arg); + } + SDVTList VTs = N->getVTList(); + SDVTList NewVTs = VTs; + if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) { + SmallVector NewVTArr(ArrayRef(VTs.VTs, VTs.NumVTs)); + NewVTArr[0] = MVT::x86mmx; + NewVTs = DAG.getVTList(NewVTArr); + MadeChange = true; + CastReturnVal = true; + } + + if (MadeChange) { + SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args); + if (CastReturnVal) { + SmallVector Returns; + for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i) + Returns.push_back(Result.getValue(i)); + Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]); + return DAG.getMergeValues(Returns, dl); + } + return Result; + } + return SDValue(); +} +static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(0); + const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + +static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(1); + const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + +static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(1); + const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + SDValue 
X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -58806,7 +58891,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI); case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); case X86ISD::PDEP: return combinePDEP(N, DAG, DCI); - // clang-format on + case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI); + case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI); + case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI); + // clang-format on } return SDValue(); diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 6f51d4141c959c..7c9738bf082164 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -623,11 +623,13 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II, if (isa(Arg)) return Constant::getNullValue(ResTy); - auto *ArgTy = dyn_cast(Arg->getType()); - // We can't easily peek through x86_mmx types. - if (!ArgTy) + // Preserve previous behavior and give up. + // TODO: treat as <8 x i8>. + if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) return nullptr; + auto *ArgTy = cast(Arg->getType()); + // Expand MOVMSK to compare/bitcast/zext: // e.g. PMOVMSKB(v16i8 x): // %cmp = icmp slt <16 x i8> %x, zeroinitializer diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index f9abb16767c91b..86fd04046d16a0 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -74,7 +74,8 @@ enum IntrinsicType : uint16_t { GATHER_AVX2, ROUNDP, ROUNDS, - RDPRU + RDPRU, + INTR_TYPE_CAST_MMX }; struct IntrinsicData { @@ -324,6 +325,8 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(mmx_maskmovq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_movnt_dq, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0), X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), @@ -2019,6 +2022,75 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + + X86_INTRINSIC_DATA(mmx_packssdw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_packsswb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_packuswb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padds_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padds_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_paddus_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_paddus_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_palignr_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pand, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pandn, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pavg_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pavg_w, 
INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pextr_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pinsr_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmadd_wd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmaxs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmaxu_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmins_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pminu_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmovmskb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulh_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulhu_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmull_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulu_dq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_por, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psad_bw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psra_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psra_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrai_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrai_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubs_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubus_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubus_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhbw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhdq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhwd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpcklbw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckldq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpcklwd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pxor, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), @@ -2027,8 +2099,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE), X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), 
X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse_cvtpd2pi, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtpi2pd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtpi2ps, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtps2pi, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_cvtss2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), X86_INTRINSIC_DATA(sse_cvtss2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), + X86_INTRINSIC_DATA(sse_cvttpd2pi, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvttps2pi, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0), X86_INTRINSIC_DATA(sse_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0), X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), @@ -2036,6 +2114,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0), X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), + X86_INTRINSIC_DATA(sse_pshuf_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0), X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), @@ -2118,14 +2197,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0), X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0), + X86_INTRINSIC_DATA(ssse3_pabs_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_pabs_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_pabs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(ssse3_pmul_hr_sw, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(ssse3_pshuf_b, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(ssse3_psign_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_psign_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_psign_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0), diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 005db42f609766..ed93b4491c50e4 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ 
b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -977,8 +977,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { } else if (NewCB->getType()->isVoidTy()) { // If the return value is dead, replace any uses of it with poison // (any non-debug value uses will get removed later on). - if (!CB.getType()->isX86_MMXTy()) - CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); + CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); } else { assert((RetTy->isStructTy() || RetTy->isArrayTy()) && "Return type changed, but not into a void. The old return type" @@ -1042,8 +1041,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { } else { // If this argument is dead, replace any uses of it with poison // (any non-debug value uses will get removed later on). - if (!I->getType()->isX86_MMXTy()) - I->replaceAllUsesWith(PoisonValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); } // If we change the return value of the function we must rewrite any return diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index e98c0a7665237a..c1417afee6b3b3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2750,13 +2750,6 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { return replaceInstUsesWith(CI, Src); if (FixedVectorType *DestVTy = dyn_cast(DestTy)) { - // Beware: messing with this target-specific oddity may cause trouble. - if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) { - Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType()); - return InsertElementInst::Create(PoisonValue::get(DestTy), Elem, - Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); - } - if (isa(SrcTy)) { // If this is a cast from an integer to vector, check to see if the input // is a trunc or zext of a bitcast from vector. If so, we can replace all diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 11435e25183cb7..624ea4cdd814f5 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2967,8 +2967,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// Caller guarantees that this intrinsic does not access memory. bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) { Type *RetTy = I.getType(); - if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy() || - RetTy->isX86_MMXTy())) + if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy())) return false; unsigned NumArgOperands = I.arg_size(); @@ -3197,7 +3196,7 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } - // Get an X86_MMX-sized vector type. + // Get an MMX-sized vector type. Type *getMMXVectorTy(unsigned EltSizeInBits) { const unsigned X86_MMXSizeInBits = 64; assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 && @@ -3243,20 +3242,21 @@ struct MemorySanitizerVisitor : public InstVisitor { // packs elements of 2 input vectors into half as many bits with saturation. // Shadow is propagated with the signed variant of the same intrinsic applied // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer). - // EltSizeInBits is used only for x86mmx arguments. 
- void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) { + // MMXEltSizeInBits is used only for x86mmx arguments. + void handleVectorPackIntrinsic(IntrinsicInst &I, + unsigned MMXEltSizeInBits = 0) { assert(I.arg_size() == 2); - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); IRBuilder<> IRB(&I); Value *S1 = getShadow(&I, 0); Value *S2 = getShadow(&I, 1); - assert(isX86_MMX || S1->getType()->isVectorTy()); + assert(S1->getType()->isVectorTy()); // SExt and ICmpNE below must apply to individual elements of input vectors. // In case of x86mmx arguments, cast them to appropriate vector types and // back. - Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType(); - if (isX86_MMX) { + Type *T = + MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits) : S1->getType(); + if (MMXEltSizeInBits) { S1 = IRB.CreateBitCast(S1, T); S2 = IRB.CreateBitCast(S2, T); } @@ -3264,16 +3264,17 @@ struct MemorySanitizerVisitor : public InstVisitor { IRB.CreateSExt(IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T); Value *S2_ext = IRB.CreateSExt(IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T); - if (isX86_MMX) { - Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C); - S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy); - S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy); + if (MMXEltSizeInBits) { + S1_ext = IRB.CreateBitCast(S1_ext, getMMXVectorTy(64)); + S2_ext = IRB.CreateBitCast(S2_ext, getMMXVectorTy(64)); } - Value *S = IRB.CreateIntrinsic(getSignedPackIntrinsic(I.getIntrinsicID()), - {}, {S1_ext, S2_ext}, /*FMFSource=*/nullptr, - "_msprop_vector_pack"); - if (isX86_MMX) + Function *ShadowFn = Intrinsic::getDeclaration( + F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); + + Value *S = + IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack"); + if (MMXEltSizeInBits) S = IRB.CreateBitCast(S, getShadowTy(&I)); setShadow(&I, S); setOriginForNaryOp(I); @@ -3380,10 +3381,9 @@ struct MemorySanitizerVisitor : public InstVisitor { } // Instrument sum-of-absolute-differences intrinsic. - void handleVectorSadIntrinsic(IntrinsicInst &I) { + void handleVectorSadIntrinsic(IntrinsicInst &I, bool IsMMX = false) { const unsigned SignificantBitsPerResultElement = 16; - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); - Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType(); + Type *ResTy = IsMMX ? IntegerType::get(*MS.C, 64) : I.getType(); unsigned ZeroBitsPerResultElement = ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement; @@ -3402,9 +3402,9 @@ struct MemorySanitizerVisitor : public InstVisitor { // Instrument multiply-add intrinsic. void handleVectorPmaddIntrinsic(IntrinsicInst &I, - unsigned EltSizeInBits = 0) { - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); - Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType(); + unsigned MMXEltSizeInBits = 0) { + Type *ResTy = + MMXEltSizeInBits ? 
getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType(); IRBuilder<> IRB(&I); auto *Shadow0 = getShadow(&I, 0); auto *Shadow1 = getShadow(&I, 1); @@ -4236,6 +4236,8 @@ struct MemorySanitizerVisitor : public InstVisitor { break; case Intrinsic::x86_mmx_psad_bw: + handleVectorSadIntrinsic(I, true); + break; case Intrinsic::x86_sse2_psad_bw: case Intrinsic::x86_avx2_psad_bw: handleVectorSadIntrinsic(I); @@ -5152,7 +5154,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase { Type *T = arg->getType(); if (T->isX86_FP80Ty()) return AK_Memory; - if (T->isFPOrFPVectorTy() || T->isX86_MMXTy()) + if (T->isFPOrFPVectorTy()) return AK_FloatingPoint; if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) return AK_GeneralPurpose; diff --git a/llvm/test/Assembler/x86mmx.ll b/llvm/test/Assembler/x86mmx.ll deleted file mode 100644 index 608347e0fceb10..00000000000000 --- a/llvm/test/Assembler/x86mmx.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llvm-as < %s | llvm-dis | FileCheck %s -; RUN: verify-uselistorder %s -; Basic smoke test for x86_mmx type. - -; CHECK: define x86_mmx @sh16 -define x86_mmx @sh16(x86_mmx %A) { -; CHECK: ret x86_mmx %A - ret x86_mmx %A -} diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll index 12dd66957fff4b..c4b932034b501a 100644 --- a/llvm/test/Bindings/llvm-c/echo.ll +++ b/llvm/test/Bindings/llvm-c/echo.ll @@ -70,7 +70,7 @@ define void @types() { %9 = alloca [3 x i22], align 4 %10 = alloca ptr addrspace(5), align 8 %11 = alloca <5 x ptr>, align 64 - %12 = alloca x86_mmx, align 8 + %12 = alloca <1 x i64>, align 8 ret void } diff --git a/llvm/test/Bitcode/bcanalyzer-types.ll b/llvm/test/Bitcode/bcanalyzer-types.ll index cbe6f5d22c9479..f1732db174c295 100644 --- a/llvm/test/Bitcode/bcanalyzer-types.ll +++ b/llvm/test/Bitcode/bcanalyzer-types.ll @@ -3,7 +3,6 @@ ; CHECK: Block ID {{.*}} (TYPE_BLOCK_ID) ; CHECK: BFLOAT ; CHECK: TOKEN -; CHECK: X86_MMX ; CHECK: HALF ; CHECK: Block ID @@ -12,11 +11,6 @@ define half @test_half(half %x, half %y) { ret half %a } -define x86_mmx @test_mmx(<2 x i32> %x) { - %a = bitcast <2 x i32> %x to x86_mmx - ret x86_mmx %a -} - define bfloat @test_bfloat(i16 %x) { %a = bitcast i16 %x to bfloat ret bfloat %a diff --git a/llvm/test/Bitcode/compatibility-3.6.ll b/llvm/test/Bitcode/compatibility-3.6.ll index 2190e2fbccf288..37a87eea41ad36 100644 --- a/llvm/test/Bitcode/compatibility-3.6.ll +++ b/llvm/test/Bitcode/compatibility-3.6.ll @@ -645,7 +645,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.7.ll b/llvm/test/Bitcode/compatibility-3.7.ll index 7e59b5c1be6e2f..8de2132d7ec892 100644 --- a/llvm/test/Bitcode/compatibility-3.7.ll +++ b/llvm/test/Bitcode/compatibility-3.7.ll @@ -689,7 +689,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.8.ll b/llvm/test/Bitcode/compatibility-3.8.ll index ebd1f2fff8c94c..7f766aa34a005f 100644 --- a/llvm/test/Bitcode/compatibility-3.8.ll +++ b/llvm/test/Bitcode/compatibility-3.8.ll @@ -742,7 +742,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 
x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.9.ll b/llvm/test/Bitcode/compatibility-3.9.ll index c34f04ceb0de39..c8309175e063f0 100644 --- a/llvm/test/Bitcode/compatibility-3.9.ll +++ b/llvm/test/Bitcode/compatibility-3.9.ll @@ -813,7 +813,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll index 05bffda1d117a3..adbd91ac6c7fe5 100644 --- a/llvm/test/Bitcode/compatibility-4.0.ll +++ b/llvm/test/Bitcode/compatibility-4.0.ll @@ -813,7 +813,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll index 0c872289c62ba8..1b500da69568af 100644 --- a/llvm/test/Bitcode/compatibility-5.0.ll +++ b/llvm/test/Bitcode/compatibility-5.0.ll @@ -820,7 +820,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll index 44c680885be34f..c1abbf0cda6eb9 100644 --- a/llvm/test/Bitcode/compatibility-6.0.ll +++ b/llvm/test/Bitcode/compatibility-6.0.ll @@ -830,7 +830,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index c09eabde8ef87a..a849789da536ac 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1160,8 +1160,6 @@ define void @typesystem() { ; CHECK: %t5 = alloca x86_fp80 %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 - %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx %t8 = alloca ptr ; CHECK: %t8 = alloca ptr %t9 = alloca <4 x i32> diff --git a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll index 69f733461efc77..ba40c5c4627d95 100644 --- a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll +++ b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll @@ -25,10 +25,8 @@ define void @test(<1 x i64> %c64, <1 x i64> %mask1, ptr %P) { ; CHECK-NEXT: popl %edi ; CHECK-NEXT: retl entry: - %tmp4 = bitcast <1 x i64> %mask1 to x86_mmx ; [#uses=1] - %tmp6 = bitcast <1 x i64> %c64 to x86_mmx ; [#uses=1] - tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp4, x86_mmx %tmp6, ptr %P ) + tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %mask1, <1 x i64> %c64, ptr %P ) ret void } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) diff --git a/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll b/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll index 79b06ba836af29..6c586782420e15 100644 --- a/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll +++ b/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < 
%s -mtriple=x86_64-apple-darwin -mattr=+mmx | FileCheck %s -@R = external global x86_mmx ; [#uses=1] +@R = external global <1 x i64> ; [#uses=1] define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind { ; CHECK-LABEL: foo: @@ -14,13 +14,11 @@ define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind { ; CHECK-NEXT: emms ; CHECK-NEXT: retq entry: - %tmp4 = bitcast <1 x i64> %B to x86_mmx ; <<4 x i16>> [#uses=1] - %tmp6 = bitcast <1 x i64> %A to x86_mmx ; <<4 x i16>> [#uses=1] - %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp6, x86_mmx %tmp4 ) ; [#uses=1] - store x86_mmx %tmp7, ptr @R + %tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w( <1 x i64> %A, <1 x i64> %B ) ; <<1 x i64>> [#uses=1] + store <1 x i64> %tmp7, ptr @R tail call void @llvm.x86.mmx.emms( ) ret void } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll index d439e827e81994..0c792644fc5c8a 100644 --- a/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll +++ b/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll @@ -5,15 +5,15 @@ entry: tail call void asm sideeffect "# top of block", "~{dirflag},~{fpsr},~{flags},~{di},~{si},~{dx},~{cx},~{ax}"( ) nounwind tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 8", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp1 = tail call x86_mmx asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; [#uses=1] + %tmp1 = tail call <1 x i64> asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <<1 x i64>> [#uses=1] tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 9", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef ) nounwind ; [#uses=1] + %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef ) nounwind ; [#uses=1] tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 10", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef, i32 undef, i32 %tmp3 ) nounwind + tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef, i32 undef, i32 %tmp3 ) nounwind tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 11", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx %tmp1 ) nounwind ; [#uses=0] + %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> %tmp1 ) nounwind ; [#uses=0] ret i32 undef } diff --git a/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll b/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll index 594edbaad29441..4a4477823a61d3 100644 --- a/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll +++ 
b/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll @@ -17,13 +17,13 @@ entry: br i1 false, label %bb.nph144.split, label %bb133 bb.nph144.split: ; preds = %entry - %tmp = bitcast <8 x i8> zeroinitializer to x86_mmx - %tmp2 = bitcast <8 x i8> zeroinitializer to x86_mmx - tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp, x86_mmx %tmp2, ptr null ) nounwind + %tmp = bitcast <8 x i8> zeroinitializer to <1 x i64> + %tmp2 = bitcast <8 x i8> zeroinitializer to <1 x i64> + tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp, <1 x i64> %tmp2, ptr null ) nounwind unreachable bb133: ; preds = %entry ret void } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind diff --git a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll index ac86279ca6667e..20673a177ac31f 100644 --- a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll +++ b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll @@ -26,25 +26,44 @@ entry: ; This is how to get MMX instructions. -define <2 x double> @a2(x86_mmx %x) nounwind { +define <2 x double> @a2(<1 x i64> %x) nounwind { ; CHECK-LABEL: a2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtpi2pd %mm0, %xmm0 +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: movl 8(%ebp), %eax +; CHECK-NEXT: movl 12(%ebp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: cvtpi2pd (%esp), %xmm0 +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl entry: - %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x) + %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %x) ret <2 x double> %y } -define x86_mmx @b2(<2 x double> %x) nounwind { +define <1 x i64> @b2(<2 x double> %x) nounwind { ; CHECK-LABEL: b2: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $8, %esp ; CHECK-NEXT: cvttpd2pi %xmm0, %mm0 +; CHECK-NEXT: movq %mm0, (%esp) +; CHECK-NEXT: movl (%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl entry: - %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x) - ret x86_mmx %y + %y = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi (<2 x double> %x) + ret <1 x i64> %y } -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) -declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) +declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) diff --git a/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll b/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll index 306aeed1ace3e1..582ebb9bdcfd15 100644 --- a/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll +++ b/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll @@ -3,14 +3,14 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" target triple = "i386-apple-macosx10.6.6" -%0 = type { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx } +%0 = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } define i32 @pixman_fill_mmx(ptr nocapture %bits, i32 %stride, i32 %bpp, i32 %x, i32 %y, i32 %width, i32 %height, i32 %xor) nounwind ssp { entry: %conv = zext i32 %xor to i64 %shl = shl nuw i64 %conv, 32 %or = 
or i64 %shl, %conv - %0 = bitcast i64 %or to x86_mmx + %0 = bitcast i64 %or to <1 x i64> ; CHECK: movq [[MMXR:%mm[0-7],]] {{%mm[0-7]}} ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} @@ -18,7 +18,7 @@ entry: ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} - %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(x86_mmx %0) nounwind, !srcloc !0 + %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(<1 x i64> %0) nounwind, !srcloc !0 %asmresult = extractvalue %0 %1, 0 %asmresult6 = extractvalue %0 %1, 1 %asmresult7 = extractvalue %0 %1, 2 @@ -34,7 +34,7 @@ entry: ; CHECK-NEXT: movq {{%mm[0-7]}}, ; CHECK-NEXT: movq {{%mm[0-7]}}, ; CHECK-NEXT: movq {{%mm[0-7]}}, - tail call void asm sideeffect "movq\09$1,\09 ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, x86_mmx %0, x86_mmx %asmresult, x86_mmx %asmresult6, x86_mmx %asmresult7, x86_mmx %asmresult8, x86_mmx %asmresult9, x86_mmx %asmresult10, x86_mmx %asmresult11) nounwind, !srcloc !1 + tail call void asm sideeffect "movq\09$1,\09 ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, <1 x i64> %0, <1 x i64> %asmresult, <1 x i64> %asmresult6, <1 x i64> %asmresult7, <1 x i64> %asmresult8, <1 x i64> %asmresult9, <1 x i64> %asmresult10, <1 x i64> %asmresult11) nounwind, !srcloc !1 tail call void @llvm.x86.mmx.emms() nounwind ret i32 1 } diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index e41e9886a836b5..0bfd8921e8b42a 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -1011,23 +1011,19 @@ define float @broadcast_lifetime() nounwind { ret float %7 } -define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind { +define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind { ; X86-LABEL: broadcast_x86_mmx: ; X86: ## %bb.0: ## %bb -; X86-NEXT: subl $12, %esp -; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-LABEL: broadcast_x86_mmx: ; X64: ## %bb.0: ## %bb -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-NEXT: vmovq %rdi, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: retq bb: - %tmp1 = bitcast x86_mmx %tmp to i64 + %tmp1 = bitcast <1 x i64> %tmp to i64 %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0 %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16> %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll index 886c3057f82c57..c50af6968f5bb2 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -1449,31 +1449,24 
@@ eintry: ret void } -define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind { +define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind { ; X86-LABEL: broadcast_x86_mmx: ; X86: ## %bb.0: ## %bb -; X86-NEXT: subl $12, %esp -; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-AVX2-LABEL: broadcast_x86_mmx: ; X64-AVX2: ## %bb.0: ## %bb -; X64-AVX2-NEXT: movdq2q %xmm0, %mm0 -; X64-AVX2-NEXT: movq %mm0, %rax -; X64-AVX2-NEXT: vmovq %rax, %xmm0 +; X64-AVX2-NEXT: vmovq %rdi, %xmm0 ; X64-AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512VL-LABEL: broadcast_x86_mmx: ; X64-AVX512VL: ## %bb.0: ## %bb -; X64-AVX512VL-NEXT: movdq2q %xmm0, %mm0 -; X64-AVX512VL-NEXT: movq %mm0, %rax -; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm0 +; X64-AVX512VL-NEXT: vpbroadcastq %rdi, %xmm0 ; X64-AVX512VL-NEXT: retq bb: - %tmp1 = bitcast x86_mmx %tmp to i64 + %tmp1 = bitcast <1 x i64> %tmp to i64 %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0 %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16> %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/bitcast-mmx.ll b/llvm/test/CodeGen/X86/bitcast-mmx.ll index 061723a0966e2b..fe48a96a51d3ec 100644 --- a/llvm/test/CodeGen/X86/bitcast-mmx.ll +++ b/llvm/test/CodeGen/X86/bitcast-mmx.ll @@ -17,9 +17,9 @@ define i32 @t0(i64 %x) nounwind { ; X64-NEXT: retq entry: %0 = bitcast i64 %x to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 -18) - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 -18) + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 %6 = bitcast i64 %5 to <2 x i32> @@ -52,9 +52,9 @@ define i64 @t1(i64 %x, i32 %n) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = bitcast i64 %x to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %n) - %2 = bitcast x86_mmx %1 to i64 + %0 = bitcast i64 %x to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %n) + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } @@ -88,11 +88,11 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind { entry: %0 = insertelement <2 x i32> undef, i32 %w, i32 0 %1 = insertelement <2 x i32> %0, i32 0, i32 1 - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %2, i32 %n) - %4 = bitcast i64 %x to x86_mmx - %5 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %4, x86_mmx %3) - %6 = bitcast x86_mmx %5 to i64 + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %2, i32 %n) + %4 = bitcast i64 %x to <1 x i64> + %5 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %4, <1 x i64> %3) + %6 = bitcast <1 x i64> %5 to i64 ret i64 %6 } @@ -123,14 +123,14 @@ define i64 @t3(ptr %y, ptr %n) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %y, align 8 + %0 = load <1 x i64>, ptr %y, align 8 %1 = load i32, ptr %n, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) -declare x86_mmx 
@llvm.x86.mmx.por(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir index 559560ac20f8af..aa637e7408f22a 100644 --- a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir +++ b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir @@ -6,9 +6,9 @@ define <2 x i32> @test_paddw(<2 x i32> %a) nounwind readnone { entry: - %0 = bitcast <2 x i32> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %0, x86_mmx %0) - %2 = bitcast x86_mmx %1 to <2 x i32> + %0 = bitcast <2 x i32> %a to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %0, <1 x i64> %0) + %2 = bitcast <1 x i64> %1 to <2 x i32> ret <2 x i32> %2 } diff --git a/llvm/test/CodeGen/X86/fake-use-vector.ll b/llvm/test/CodeGen/X86/fake-use-vector.ll index be1dc123f8023e..1995b42f31ccee 100644 --- a/llvm/test/CodeGen/X86/fake-use-vector.ll +++ b/llvm/test/CodeGen/X86/fake-use-vector.ll @@ -1,6 +1,5 @@ ; assert in DAGlegalizer with fake use of 1-element vectors. ; RUN: llc -stop-after=finalize-isel -mtriple=x86_64-unknown-linux -filetype=asm -o - %s | FileCheck %s -; XFAIL: * ; ModuleID = 't2.cpp' ; source_filename = "t2.cpp" ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/CodeGen/X86/fast-isel-bc.ll b/llvm/test/CodeGen/X86/fast-isel-bc.ll index 0fbc9fab056814..64bdfd6d4f8632 100644 --- a/llvm/test/CodeGen/X86/fast-isel-bc.ll +++ b/llvm/test/CodeGen/X86/fast-isel-bc.ll @@ -4,7 +4,7 @@ ; PR4684 -declare void @func2(x86_mmx) +declare void @func2(<1 x i64>) ; This isn't spectacular, but it's MMX code at -O0... 
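The hunk that follows also shows the ABI side of the change: with the x86_mmx type gone, the <1 x i64> constant reaches @func2 in a general-purpose register on x86-64 (movabsq into %rdi) and through plain 32-bit stack stores on i386, instead of being materialized in %mm0. A minimal caller written directly against the new signature looks roughly like the sketch below; @use_mmx and @build_arg are illustrative names, not part of the test.

define void @build_arg(<2 x i32> %v) nounwind {
  ; An element-typed vector becomes the 64-bit MMX payload with one bitcast.
  %m = bitcast <2 x i32> %v to <1 x i64>
  call void @use_mmx(<1 x i64> %m)
  ret void
}

declare void @use_mmx(<1 x i64>)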
@@ -12,7 +12,11 @@ define void @func1() nounwind { ; X86-LABEL: func1: ; X86: ## %bb.0: ; X86-NEXT: subl $12, %esp -; X86-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 ## mm0 = 0x200000000 +; X86-NEXT: movl $2, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl %esp, %eax +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: calll _func2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: retl @@ -20,12 +24,11 @@ define void @func1() nounwind { ; X64-LABEL: func1: ; X64: ## %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0 ## mm0 = 0x200000000 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movabsq $8589934592, %rdi ## imm = 0x200000000 ; X64-NEXT: callq _func2 ; X64-NEXT: popq %rax ; X64-NEXT: retq - %tmp0 = bitcast <2 x i32> to x86_mmx - call void @func2(x86_mmx %tmp0) + %tmp0 = bitcast <2 x i32> to <1 x i64> + call void @func2(<1 x i64> %tmp0) ret void } diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll index c13fdae540d0b8..3b1a8f541b4902 100644 --- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -104,12 +104,12 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) { ; ALL-NEXT: movntq %mm0, (%rsi) ; ALL-NEXT: retq entry: - %0 = load x86_mmx, ptr %a0 - %1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3) - store x86_mmx %1, ptr %a1, align 8, !nontemporal !1 + %0 = load <1 x i64>, ptr %a0 + %1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3) + store <1 x i64> %1, ptr %a1, align 8, !nontemporal !1 ret void } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone ; ; 128-bit Vector Stores diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll index a4dbb10e0d7a04..439d7efc2d7551 100644 --- a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll +++ b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll @@ -10,30 +10,29 @@ define void @t3() nounwind { ; X86-64-LABEL: t3: ; X86-64: ## %bb.0: ; X86-64-NEXT: movq _g_v8qi@GOTPCREL(%rip), %rax -; X86-64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-64-NEXT: movb $1, %al +; X86-64-NEXT: movq (%rax), %rdi +; X86-64-NEXT: xorl %eax, %eax ; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL %tmp3 = load <8 x i8>, ptr @g_v8qi, align 8 - %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx - %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind + %tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64> + %tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind ret void } -define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind { +define void @t4(<1 x i64> %v1, <1 x i64> %v2) nounwind { ; X86-64-LABEL: t4: ; X86-64: ## %bb.0: -; X86-64-NEXT: movdq2q %xmm1, %mm0 -; X86-64-NEXT: movdq2q %xmm0, %mm1 -; X86-64-NEXT: movq2dq %mm1, %xmm1 -; X86-64-NEXT: movq2dq %mm0, %xmm0 -; X86-64-NEXT: paddb %xmm1, %xmm0 -; X86-64-NEXT: movb $1, %al +; X86-64-NEXT: movq %rdi, %xmm0 +; X86-64-NEXT: movq %rsi, %xmm1 +; X86-64-NEXT: paddb %xmm0, %xmm1 +; X86-64-NEXT: movq %xmm1, %rdi +; X86-64-NEXT: xorl %eax, %eax ; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL - %v1a = bitcast x86_mmx %v1 to <8 x i8> - %v2b = bitcast x86_mmx %v2 to <8 x i8> + %v1a = bitcast <1 x i64> %v1 to <8 x i8> + %v2b = bitcast <1 x i64> %v2 to <8 x i8> %tmp3 = add <8 x i8> %v1a, %v2b - %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx - %tmp4 = tail call i32 (...) 
@pass_v8qi( x86_mmx %tmp3a ) nounwind + %tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64> + %tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind ret void } diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing.ll b/llvm/test/CodeGen/X86/mmx-arg-passing.ll index af116a2ac281b3..d933149c5e027e 100644 --- a/llvm/test/CodeGen/X86/mmx-arg-passing.ll +++ b/llvm/test/CodeGen/X86/mmx-arg-passing.ll @@ -8,26 +8,28 @@ ; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7]. ; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs. -@u1 = external global x86_mmx +@u1 = external global <1 x i64> -define void @t1(x86_mmx %v1) nounwind { +define void @t1(<1 x i64> %v1) nounwind { ; X86-32-LABEL: t1: ; X86-32: ## %bb.0: -; X86-32-NEXT: movl L_u1$non_lazy_ptr, %eax -; X86-32-NEXT: movq %mm0, (%eax) +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movl L_u1$non_lazy_ptr, %edx +; X86-32-NEXT: movl %ecx, 4(%edx) +; X86-32-NEXT: movl %eax, (%edx) ; X86-32-NEXT: retl ; ; X86-64-LABEL: t1: ; X86-64: ## %bb.0: -; X86-64-NEXT: movdq2q %xmm0, %mm0 ; X86-64-NEXT: movq _u1@GOTPCREL(%rip), %rax -; X86-64-NEXT: movq %mm0, (%rax) +; X86-64-NEXT: movq %rdi, (%rax) ; X86-64-NEXT: retq - store x86_mmx %v1, ptr @u1, align 8 + store <1 x i64> %v1, ptr @u1, align 8 ret void } -@u2 = external global x86_mmx +@u2 = external global <1 x i64> define void @t2(<1 x i64> %v1) nounwind { ; X86-32-LABEL: t2: @@ -44,7 +46,6 @@ define void @t2(<1 x i64> %v1) nounwind { ; X86-64-NEXT: movq _u2@GOTPCREL(%rip), %rax ; X86-64-NEXT: movq %rdi, (%rax) ; X86-64-NEXT: retq - %tmp = bitcast <1 x i64> %v1 to x86_mmx - store x86_mmx %tmp, ptr @u2, align 8 + store <1 x i64> %v1, ptr @u2, align 8 ret void } diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index f9ef3dda78cfcf..73d459ba770264 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -18,8 +18,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X86-NEXT: paddsb (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: paddusb (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: psubb %xmm1, %xmm0 ; X86-NEXT: movdq2q %xmm0, %mm0 @@ -27,8 +27,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X86-NEXT: psubsb (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: psubusb (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -58,8 +58,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X64-NEXT: paddsb (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: paddusb (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: movdq2q %xmm0, %mm0 @@ -67,8 +67,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X64-NEXT: psubsb (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: psubusb (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -88,53 +88,53 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X64-NEXT: emms ; X64-NEXT: retq entry: - %tmp1 = load x86_mmx, ptr %A - %tmp3 = load x86_mmx, ptr %B - %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8> - %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8> + %tmp1 = load <1 x i64>, ptr %A + %tmp3 = load <1 x i64>, ptr %B + %tmp1a = bitcast <1 x i64> %tmp1 to <8 x i8> + %tmp3a = bitcast <1 x i64> %tmp3 to <8 x i8> %tmp4 = add <8 x i8> %tmp1a, %tmp3a - %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx - store x86_mmx %tmp4a, ptr %A - %tmp7 = load x86_mmx, ptr %B - %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %tmp4a, x86_mmx %tmp7) - store x86_mmx %tmp12, ptr %A - %tmp16 = load x86_mmx, ptr %B - %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %tmp12, x86_mmx %tmp16) - store x86_mmx %tmp21, ptr %A - %tmp27 = load x86_mmx, ptr %B - %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8> - %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8> + %tmp4a = bitcast <8 x i8> %tmp4 to <1 x i64> + store <1 x i64> %tmp4a, ptr %A + %tmp7 = load <1 x i64>, ptr %B + %tmp12 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %tmp4a, <1 x i64> %tmp7) + store <1 x i64> %tmp12, ptr %A + %tmp16 = load <1 x i64>, ptr %B + %tmp21 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %tmp12, <1 x i64> %tmp16) + store <1 x i64> %tmp21, ptr %A + %tmp27 = load <1 x i64>, ptr %B + %tmp21a = bitcast <1 x i64> %tmp21 to <8 x i8> + %tmp27a = bitcast <1 x i64> %tmp27 to <8 x i8> %tmp28 = sub <8 x i8> %tmp21a, %tmp27a - %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx - store x86_mmx %tmp28a, ptr %A - %tmp31 = load x86_mmx, ptr %B - %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %tmp28a, x86_mmx %tmp31) - store x86_mmx %tmp36, ptr %A - %tmp40 = load x86_mmx, ptr %B - %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %tmp36, x86_mmx %tmp40) - store x86_mmx %tmp45, ptr %A - %tmp51 = load x86_mmx, ptr %B - %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8> - %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8> + %tmp28a = bitcast <8 x i8> %tmp28 to <1 x i64> + store <1 x i64> %tmp28a, ptr %A + %tmp31 = load <1 x i64>, ptr %B + %tmp36 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %tmp28a, <1 x i64> %tmp31) + store <1 x i64> %tmp36, ptr %A + %tmp40 = load <1 x i64>, ptr %B + %tmp45 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %tmp36, <1 x i64> %tmp40) + store <1 x i64> %tmp45, ptr %A + %tmp51 = load <1 x i64>, ptr %B + %tmp45a = bitcast <1 x i64> %tmp45 to <8 x i8> + %tmp51a = bitcast <1 x i64> %tmp51 to <8 x i8> %tmp52 = mul <8 x i8> %tmp45a, %tmp51a - %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx - store x86_mmx %tmp52a, ptr %A - %tmp57 = load x86_mmx, ptr %B - %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8> + %tmp52a = bitcast <8 x i8> %tmp52 to <1 x i64> + store <1 x i64> %tmp52a, ptr %A + %tmp57 = load <1 x i64>, ptr %B + %tmp57a = bitcast <1 x i64> %tmp57 to <8 x i8> %tmp58 = and <8 x i8> %tmp52, %tmp57a - %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx - store x86_mmx %tmp58a, ptr %A - %tmp63 = load x86_mmx, ptr %B - %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8> + %tmp58a = bitcast <8 x i8> %tmp58 to <1 x i64> + store <1 x i64> %tmp58a, ptr %A + %tmp63 = load <1 x i64>, ptr %B + %tmp63a = bitcast <1 x i64> %tmp63 to <8 x i8> %tmp64 = or <8 x i8> %tmp58, %tmp63a - %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx - store x86_mmx %tmp64a, ptr %A - %tmp69 = load x86_mmx, ptr %B - %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8> - %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8> + %tmp64a 
= bitcast <8 x i8> %tmp64 to <1 x i64> + store <1 x i64> %tmp64a, ptr %A + %tmp69 = load <1 x i64>, ptr %B + %tmp69a = bitcast <1 x i64> %tmp69 to <8 x i8> + %tmp64b = bitcast <1 x i64> %tmp64a to <8 x i8> %tmp70 = xor <8 x i8> %tmp64b, %tmp69a - %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx - store x86_mmx %tmp70a, ptr %A + %tmp70a = bitcast <8 x i8> %tmp70 to <1 x i64> + store <1 x i64> %tmp70a, ptr %A tail call void @llvm.x86.mmx.emms() ret void } @@ -196,42 +196,42 @@ define void @test1(ptr %A, ptr %B) nounwind { ; X64-NEXT: emms ; X64-NEXT: retq entry: - %tmp1 = load x86_mmx, ptr %A - %tmp3 = load x86_mmx, ptr %B - %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32> - %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32> + %tmp1 = load <1 x i64>, ptr %A + %tmp3 = load <1 x i64>, ptr %B + %tmp1a = bitcast <1 x i64> %tmp1 to <2 x i32> + %tmp3a = bitcast <1 x i64> %tmp3 to <2 x i32> %tmp4 = add <2 x i32> %tmp1a, %tmp3a - %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx - store x86_mmx %tmp4a, ptr %A - %tmp9 = load x86_mmx, ptr %B - %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32> + %tmp4a = bitcast <2 x i32> %tmp4 to <1 x i64> + store <1 x i64> %tmp4a, ptr %A + %tmp9 = load <1 x i64>, ptr %B + %tmp9a = bitcast <1 x i64> %tmp9 to <2 x i32> %tmp10 = sub <2 x i32> %tmp4, %tmp9a - %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx - store x86_mmx %tmp10a, ptr %A - %tmp15 = load x86_mmx, ptr %B - %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32> - %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32> + %tmp10a = bitcast <2 x i32> %tmp4 to <1 x i64> + store <1 x i64> %tmp10a, ptr %A + %tmp15 = load <1 x i64>, ptr %B + %tmp10b = bitcast <1 x i64> %tmp10a to <2 x i32> + %tmp15a = bitcast <1 x i64> %tmp15 to <2 x i32> %tmp16 = mul <2 x i32> %tmp10b, %tmp15a - %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx - store x86_mmx %tmp16a, ptr %A - %tmp21 = load x86_mmx, ptr %B - %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32> - %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32> + %tmp16a = bitcast <2 x i32> %tmp16 to <1 x i64> + store <1 x i64> %tmp16a, ptr %A + %tmp21 = load <1 x i64>, ptr %B + %tmp16b = bitcast <1 x i64> %tmp16a to <2 x i32> + %tmp21a = bitcast <1 x i64> %tmp21 to <2 x i32> %tmp22 = and <2 x i32> %tmp16b, %tmp21a - %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx - store x86_mmx %tmp22a, ptr %A - %tmp27 = load x86_mmx, ptr %B - %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32> - %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32> + %tmp22a = bitcast <2 x i32> %tmp22 to <1 x i64> + store <1 x i64> %tmp22a, ptr %A + %tmp27 = load <1 x i64>, ptr %B + %tmp22b = bitcast <1 x i64> %tmp22a to <2 x i32> + %tmp27a = bitcast <1 x i64> %tmp27 to <2 x i32> %tmp28 = or <2 x i32> %tmp22b, %tmp27a - %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx - store x86_mmx %tmp28a, ptr %A - %tmp33 = load x86_mmx, ptr %B - %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32> - %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32> + %tmp28a = bitcast <2 x i32> %tmp28 to <1 x i64> + store <1 x i64> %tmp28a, ptr %A + %tmp33 = load <1 x i64>, ptr %B + %tmp28b = bitcast <1 x i64> %tmp28a to <2 x i32> + %tmp33a = bitcast <1 x i64> %tmp33 to <2 x i32> %tmp34 = xor <2 x i32> %tmp28b, %tmp33a - %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx - store x86_mmx %tmp34a, ptr %A + %tmp34a = bitcast <2 x i32> %tmp34 to <1 x i64> + store <1 x i64> %tmp34a, ptr %A tail call void @llvm.x86.mmx.emms( ) ret void } @@ -239,8 +239,13 @@ entry: define void @test2(ptr %A, ptr %B) nounwind { ; X86-LABEL: test2: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: paddw %xmm0, %xmm1 @@ -249,8 +254,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: paddsw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: paddusw (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: psubw %xmm1, %xmm0 ; X86-NEXT: movdq2q %xmm0, %mm0 @@ -258,8 +263,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: psubsw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: psubusw (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: pmullw %xmm0, %xmm1 ; X86-NEXT: movdq2q %xmm1, %mm0 @@ -267,18 +272,26 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: pmulhw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: pmaddwd (%ecx), %mm0 +; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: movl (%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movq %mm0, (%eax) -; X86-NEXT: movq2dq %mm0, %xmm0 -; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: andps %xmm0, %xmm1 -; X86-NEXT: movlps %xmm1, (%eax) -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: orps %xmm1, %xmm0 -; X86-NEXT: movlps %xmm0, (%eax) -; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: xorps %xmm0, %xmm1 -; X86-NEXT: movlps %xmm1, (%eax) +; X86-NEXT: andl 4(%ecx), %esi +; X86-NEXT: movd %esi, %xmm0 +; X86-NEXT: andl (%ecx), %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%eax) +; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: pxor %xmm0, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: emms +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: test2: @@ -291,8 +304,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: paddsw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: paddusw (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: psubw %xmm1, %xmm0 ; X64-NEXT: movdq2q %xmm0, %mm0 @@ -300,8 +313,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: psubsw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: psubusw (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: pmullw %xmm0, %xmm1 ; X64-NEXT: movdq2q %xmm1, %mm0 @@ -309,76 +322,75 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: pmulhw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: pmaddwd (%rsi), %mm0 +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: movq %mm0, (%rdi) -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: andps %xmm0, %xmm1 -; X64-NEXT: movlps %xmm1, (%rdi) -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: orps %xmm1, %xmm0 -; X64-NEXT: movlps %xmm0, (%rdi) -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: 
xorps %xmm0, %xmm1 -; X64-NEXT: movlps %xmm1, (%rdi) +; X64-NEXT: andq (%rsi), %rax +; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq entry: - %tmp1 = load x86_mmx, ptr %A - %tmp3 = load x86_mmx, ptr %B - %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16> - %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16> + %tmp1 = load <1 x i64>, ptr %A + %tmp3 = load <1 x i64>, ptr %B + %tmp1a = bitcast <1 x i64> %tmp1 to <4 x i16> + %tmp3a = bitcast <1 x i64> %tmp3 to <4 x i16> %tmp4 = add <4 x i16> %tmp1a, %tmp3a - %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx - store x86_mmx %tmp4a, ptr %A - %tmp7 = load x86_mmx, ptr %B - %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %tmp4a, x86_mmx %tmp7) - store x86_mmx %tmp12, ptr %A - %tmp16 = load x86_mmx, ptr %B - %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp12, x86_mmx %tmp16) - store x86_mmx %tmp21, ptr %A - %tmp27 = load x86_mmx, ptr %B - %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16> - %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16> + %tmp4a = bitcast <4 x i16> %tmp4 to <1 x i64> + store <1 x i64> %tmp4a, ptr %A + %tmp7 = load <1 x i64>, ptr %B + %tmp12 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %tmp4a, <1 x i64> %tmp7) + store <1 x i64> %tmp12, ptr %A + %tmp16 = load <1 x i64>, ptr %B + %tmp21 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %tmp12, <1 x i64> %tmp16) + store <1 x i64> %tmp21, ptr %A + %tmp27 = load <1 x i64>, ptr %B + %tmp21a = bitcast <1 x i64> %tmp21 to <4 x i16> + %tmp27a = bitcast <1 x i64> %tmp27 to <4 x i16> %tmp28 = sub <4 x i16> %tmp21a, %tmp27a - %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx - store x86_mmx %tmp28a, ptr %A - %tmp31 = load x86_mmx, ptr %B - %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %tmp28a, x86_mmx %tmp31) - store x86_mmx %tmp36, ptr %A - %tmp40 = load x86_mmx, ptr %B - %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %tmp36, x86_mmx %tmp40) - store x86_mmx %tmp45, ptr %A - %tmp51 = load x86_mmx, ptr %B - %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16> - %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16> + %tmp28a = bitcast <4 x i16> %tmp28 to <1 x i64> + store <1 x i64> %tmp28a, ptr %A + %tmp31 = load <1 x i64>, ptr %B + %tmp36 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %tmp28a, <1 x i64> %tmp31) + store <1 x i64> %tmp36, ptr %A + %tmp40 = load <1 x i64>, ptr %B + %tmp45 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %tmp36, <1 x i64> %tmp40) + store <1 x i64> %tmp45, ptr %A + %tmp51 = load <1 x i64>, ptr %B + %tmp45a = bitcast <1 x i64> %tmp45 to <4 x i16> + %tmp51a = bitcast <1 x i64> %tmp51 to <4 x i16> %tmp52 = mul <4 x i16> %tmp45a, %tmp51a - %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx - store x86_mmx %tmp52a, ptr %A - %tmp55 = load x86_mmx, ptr %B - %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %tmp52a, x86_mmx %tmp55) - store x86_mmx %tmp60, ptr %A - %tmp64 = load x86_mmx, ptr %B - %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %tmp60, x86_mmx %tmp64) - %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx - store x86_mmx %tmp70, ptr %A - %tmp75 = load x86_mmx, ptr %B - %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16> - %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16> + %tmp52a = bitcast <4 x i16> %tmp52 to <1 x i64> + store <1 x i64> %tmp52a, ptr %A + %tmp55 = load <1 x i64>, ptr 
%B + %tmp60 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %tmp52a, <1 x i64> %tmp55) + store <1 x i64> %tmp60, ptr %A + %tmp64 = load <1 x i64>, ptr %B + %tmp69 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %tmp60, <1 x i64> %tmp64) + store <1 x i64> %tmp69, ptr %A + %tmp75 = load <1 x i64>, ptr %B + %tmp70a = bitcast <1 x i64> %tmp69 to <4 x i16> + %tmp75a = bitcast <1 x i64> %tmp75 to <4 x i16> %tmp76 = and <4 x i16> %tmp70a, %tmp75a - %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx - store x86_mmx %tmp76a, ptr %A - %tmp81 = load x86_mmx, ptr %B - %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16> - %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16> + %tmp76a = bitcast <4 x i16> %tmp76 to <1 x i64> + store <1 x i64> %tmp76a, ptr %A + %tmp81 = load <1 x i64>, ptr %B + %tmp76b = bitcast <1 x i64> %tmp76a to <4 x i16> + %tmp81a = bitcast <1 x i64> %tmp81 to <4 x i16> %tmp82 = or <4 x i16> %tmp76b, %tmp81a - %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx - store x86_mmx %tmp82a, ptr %A - %tmp87 = load x86_mmx, ptr %B - %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16> - %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16> + %tmp82a = bitcast <4 x i16> %tmp82 to <1 x i64> + store <1 x i64> %tmp82a, ptr %A + %tmp87 = load <1 x i64>, ptr %B + %tmp82b = bitcast <1 x i64> %tmp82a to <4 x i16> + %tmp87a = bitcast <1 x i64> %tmp87 to <4 x i16> %tmp88 = xor <4 x i16> %tmp82b, %tmp87a - %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx - store x86_mmx %tmp88a, ptr %A + %tmp88a = bitcast <4 x i16> %tmp88 to <1 x i64> + store <1 x i64> %tmp88a, ptr %A tail call void @llvm.x86.mmx.emms( ) ret void } @@ -574,10 +586,10 @@ define void @ti8a(double %a, double %b) nounwind { ; X64-NEXT: movq %mm1, 0 ; X64-NEXT: retq entry: - %tmp1 = bitcast double %a to x86_mmx - %tmp2 = bitcast double %b to x86_mmx - %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2) - store x86_mmx %tmp3, ptr null + %tmp1 = bitcast double %a to <1 x i64> + %tmp2 = bitcast double %b to <1 x i64> + %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %tmp1, <1 x i64> %tmp2) + store <1 x i64> %tmp3, ptr null ret void } @@ -597,10 +609,10 @@ define void @ti16a(double %a, double %b) nounwind { ; X64-NEXT: movq %mm1, 0 ; X64-NEXT: retq entry: - %tmp1 = bitcast double %a to x86_mmx - %tmp2 = bitcast double %b to x86_mmx - %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2) - store x86_mmx %tmp3, ptr null + %tmp1 = bitcast double %a to <1 x i64> + %tmp2 = bitcast double %b to <1 x i64> + %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %tmp1, <1 x i64> %tmp2) + store <1 x i64> %tmp3, ptr null ret void } @@ -620,10 +632,10 @@ define void @ti32a(double %a, double %b) nounwind { ; X64-NEXT: movq %mm1, 0 ; X64-NEXT: retq entry: - %tmp1 = bitcast double %a to x86_mmx - %tmp2 = bitcast double %b to x86_mmx - %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2) - store x86_mmx %tmp3, ptr null + %tmp1 = bitcast double %a to <1 x i64> + %tmp2 = bitcast double %b to <1 x i64> + %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %tmp1, <1 x i64> %tmp2) + store <1 x i64> %tmp3, ptr null ret void } @@ -643,10 +655,10 @@ define void @ti64a(double %a, double %b) nounwind { ; X64-NEXT: movq %mm1, 0 ; X64-NEXT: retq entry: - %tmp1 = bitcast double %a to x86_mmx - %tmp2 = bitcast double %b to x86_mmx - %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2) - store x86_mmx %tmp3, ptr null + %tmp1 = bitcast double %a to <1 x i64> + %tmp2 = bitcast double 
%b to <1 x i64> + %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %tmp1, <1 x i64> %tmp2) + store <1 x i64> %tmp3, ptr null ret void } @@ -674,28 +686,28 @@ define i64 @pr43922() nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx bitcast (<2 x i32> to x86_mmx), i32 268435456) - %1 = bitcast x86_mmx %0 to i64 + %0 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> bitcast (<2 x i32> to <1 x i64>), i32 268435456) + %1 = bitcast <1 x i64> %0 to i64 ret i64 %1 } -declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll index 0fa7b24ff445aa..fb2517f5a891be 100644 --- a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll +++ b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll @@ -1,12 +1,12 @@ ; RUN: opt -mtriple=x86_64-- -passes=early-cse -earlycse-debug-hash < %s -S | FileCheck %s -; CHECK: @foo(x86_mmx bitcast (double 0.000000e+00 to x86_mmx)) +; CHECK: @foo(<1 x i64> zeroinitializer) define void @bar() { entry: - %0 = bitcast double 0.0 to x86_mmx - %1 = call x86_mmx @foo(x86_mmx %0) + %0 = bitcast double 0.0 to <1 x i64> + %1 = call <1 x i64> @foo(<1 x i64> %0) ret void } -declare x86_mmx @foo(x86_mmx) +declare <1 x i64> @foo(<1 x i64>) diff --git a/llvm/test/CodeGen/X86/mmx-bitcast.ll b/llvm/test/CodeGen/X86/mmx-bitcast.ll index f914b8622fcf4b..5e5be820dd5b42 100644 --- a/llvm/test/CodeGen/X86/mmx-bitcast.ll +++ b/llvm/test/CodeGen/X86/mmx-bitcast.ll @@ -8,9 +8,9 @@ define i64 @t0(ptr %p) { ; CHECK-NEXT: paddq %mm0, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %t = load x86_mmx, ptr %p - %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t) - %s = bitcast x86_mmx %u to i64 
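; (Illustrative sketch, not part of the test: the t0-t3 bodies below now operate
;  on the loaded <1 x i64> directly and recover the scalar result with a single
;  bitcast, e.g.
;    %t = load <1 x i64>, ptr %p
;    %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %t)
;    %s = bitcast <1 x i64> %u to i64
;  so no intermediate x86_mmx casts are needed.)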
+ %t = load <1 x i64>, ptr %p + %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %t) + %s = bitcast <1 x i64> %u to i64 ret i64 %s } @@ -21,9 +21,9 @@ define i64 @t1(ptr %p) { ; CHECK-NEXT: paddd %mm0, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %t = load x86_mmx, ptr %p - %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t) - %s = bitcast x86_mmx %u to i64 + %t = load <1 x i64>, ptr %p + %u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %t) + %s = bitcast <1 x i64> %u to i64 ret i64 %s } @@ -34,9 +34,9 @@ define i64 @t2(ptr %p) { ; CHECK-NEXT: paddw %mm0, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %t = load x86_mmx, ptr %p - %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t) - %s = bitcast x86_mmx %u to i64 + %t = load <1 x i64>, ptr %p + %u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %t) + %s = bitcast <1 x i64> %u to i64 ret i64 %s } @@ -47,29 +47,27 @@ define i64 @t3(ptr %p) { ; CHECK-NEXT: paddb %mm0, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %t = load x86_mmx, ptr %p - %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t) - %s = bitcast x86_mmx %u to i64 + %t = load <1 x i64>, ptr %p + %u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %t) + %s = bitcast <1 x i64> %u to i64 ret i64 %s } -@R = external global x86_mmx +@R = external global <1 x i64> define void @t4(<1 x i64> %A, <1 x i64> %B) { ; CHECK-LABEL: t4: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movq %rdi, %mm0 -; CHECK-NEXT: movq %rsi, %mm1 +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 ; CHECK-NEXT: paddusw %mm0, %mm1 ; CHECK-NEXT: movq _R@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq %mm1, (%rax) ; CHECK-NEXT: emms ; CHECK-NEXT: retq entry: - %tmp2 = bitcast <1 x i64> %A to x86_mmx - %tmp3 = bitcast <1 x i64> %B to x86_mmx - %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp2, x86_mmx %tmp3) - store x86_mmx %tmp7, ptr @R + %tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %A, <1 x i64> %B) + store <1 x i64> %tmp7, ptr @R tail call void @llvm.x86.mmx.emms() ret void } @@ -88,7 +86,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone { ret i64 %conv } -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) define <1 x i64> @t6(i64 %t) { ; CHECK-LABEL: t6: @@ -98,16 +96,14 @@ define <1 x i64> @t6(i64 %t) { ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %t1 = insertelement <1 x i64> undef, i64 %t, i32 0 - %t0 = bitcast <1 x i64> %t1 to x86_mmx - %t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48) - %t3 = bitcast x86_mmx %t2 to <1 x i64> - ret <1 x i64> %t3 + %t2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %t1, i32 48) + ret <1 x i64> %t2 } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll 
b/llvm/test/CodeGen/X86/mmx-build-vector.ll index b919c9a33ea2f9..d8a010bacc683d 100644 --- a/llvm/test/CodeGen/X86/mmx-build-vector.ll +++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll @@ -8,7 +8,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f | FileCheck %s --check-prefix=X64 -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) ; ; v2i32 @@ -35,9 +35,9 @@ define void @build_v2i32_01(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 %a0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -58,9 +58,9 @@ define void @build_v2i32_0z(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 %a0, i32 0 %2 = insertelement <2 x i32> %1, i32 0, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -92,9 +92,9 @@ define void @build_v2i32_u1(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 undef, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -119,9 +119,9 @@ define void @build_v2i32_z1(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -153,9 +153,9 @@ define void @build_v2i32_00(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 %a0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a0, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -194,9 +194,9 @@ define void @build_v4i16_0123(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 %a2, i32 2 %4 = insertelement <4 x i16> %3, i16 %a3, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -229,9 +229,9 @@ 
define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 0, i32 2 %4 = insertelement <4 x i16> %3, i16 0, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -254,9 +254,9 @@ define void @build_v4i16_0uuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 undef, i32 1 %3 = insertelement <4 x i16> %2, i16 undef, i32 2 %4 = insertelement <4 x i16> %3, i16 0, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -281,9 +281,9 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 0, i32 1 %3 = insertelement <4 x i16> %2, i16 undef, i32 2 %4 = insertelement <4 x i16> %3, i16 0, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -316,9 +316,9 @@ define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 %a2, i32 2 %4 = insertelement <4 x i16> %3, i16 undef, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -353,9 +353,9 @@ define void @build_v4i16_0u00(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 undef, i32 1 %3 = insertelement <4 x i16> %2, i16 %a0, i32 2 %4 = insertelement <4 x i16> %3, i16 %a0, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -414,9 +414,9 @@ define void @build_v8i8_01234567(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 %a5, i32 5 %7 = insertelement <8 x i8> %6, i8 %a6, i32 6 %8 = insertelement <8 x i8> %7, i8 %a7, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -469,9 +469,9 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 %a5, i32 5 %7 = insertelement <8 x i8> %6, i8 0, i32 6 %8 = insertelement <8 x i8> %7, i8 %a7, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - 
store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -522,9 +522,9 @@ define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 0, i32 5 %7 = insertelement <8 x i8> %6, i8 0, i32 6 %8 = insertelement <8 x i8> %7, i8 undef, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -551,9 +551,9 @@ define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 0, i32 5 %7 = insertelement <8 x i8> %6, i8 0, i32 6 %8 = insertelement <8 x i8> %7, i8 0, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -582,9 +582,9 @@ define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 0, i32 5 %7 = insertelement <8 x i8> %6, i8 0, i32 6 %8 = insertelement <8 x i8> %7, i8 undef, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -626,9 +626,9 @@ define void @build_v8i8_00000000(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 %a0, i32 5 %7 = insertelement <8 x i8> %6, i8 %a0, i32 6 %8 = insertelement <8 x i8> %7, i8 %a0, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -669,9 +669,9 @@ define void @build_v2f32_01(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float %a0, i32 0 %2 = insertelement <2 x float> %1, float %a1, i32 1 - %3 = bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -707,9 +707,9 @@ define void @build_v2f32_0z(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float %a0, i32 0 %2 = insertelement <2 x float> %1, float 0.0, i32 1 - %3 = bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -742,9 +742,9 @@ define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float undef, i32 0 %2 = insertelement <2 x float> %1, float %a1, i32 1 - %3 = 
bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -780,9 +780,9 @@ define void @build_v2f32_z1(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float 0.0, i32 0 %2 = insertelement <2 x float> %1, float %a1, i32 1 - %3 = bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -815,8 +815,8 @@ define void @build_v2f32_00(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float %a0, i32 0 %2 = insertelement <2 x float> %1, float %a0, i32 1 - %3 = bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } diff --git a/llvm/test/CodeGen/X86/mmx-coalescing.ll b/llvm/test/CodeGen/X86/mmx-coalescing.ll index dac526fe20bbf0..589f5af4bb4d64 100644 --- a/llvm/test/CodeGen/X86/mmx-coalescing.ll +++ b/llvm/test/CodeGen/X86/mmx-coalescing.ll @@ -42,9 +42,9 @@ entry: %SA2 = getelementptr inbounds %SA, ptr %pSA, i64 0, i32 4 %v3 = load ptr, ptr %SA2, align 8 %v4 = bitcast <1 x i64> %v0 to <4 x i16> - %v5 = bitcast <4 x i16> %v4 to x86_mmx - %v6 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v5, i8 -18) - %v7 = bitcast x86_mmx %v6 to <4 x i16> + %v5 = bitcast <4 x i16> %v4 to <1 x i64> + %v6 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v5, i8 -18) + %v7 = bitcast <1 x i64> %v6 to <4 x i16> %v8 = bitcast <4 x i16> %v7 to <1 x i64> %v9 = extractelement <1 x i64> %v8, i32 0 %v10 = bitcast i64 %v9 to <2 x i32> @@ -55,18 +55,18 @@ entry: if.A: %pa = phi <1 x i64> [ %v8, %entry ], [ %vx, %if.C ] %v17 = extractelement <1 x i64> %pa, i32 0 - %v18 = bitcast i64 %v17 to x86_mmx - %v19 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %v18, i32 %B) #2 - %v20 = bitcast x86_mmx %v19 to i64 + %v18 = bitcast i64 %v17 to <1 x i64> + %v19 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %v18, i32 %B) #2 + %v20 = bitcast <1 x i64> %v19 to i64 %v21 = insertelement <1 x i64> undef, i64 %v20, i32 0 %cmp3 = icmp eq i64 %v20, 0 br i1 %cmp3, label %if.C, label %merge if.B: %v34 = bitcast <1 x i64> %v8 to <4 x i16> - %v35 = bitcast <4 x i16> %v34 to x86_mmx - %v36 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v35, i8 -18) - %v37 = bitcast x86_mmx %v36 to <4 x i16> + %v35 = bitcast <4 x i16> %v34 to <1 x i64> + %v36 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v35, i8 -18) + %v37 = bitcast <1 x i64> %v36 to <4 x i16> %v38 = bitcast <4 x i16> %v37 to <1 x i64> br label %if.C @@ -80,9 +80,9 @@ if.C: merge: %vy = phi <1 x i64> [ %v21, %if.A ], [ %vx, %if.C ] %v130 = bitcast <1 x i64> %vy to <4 x i16> - %v131 = bitcast <4 x i16> %v130 to x86_mmx - %v132 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v131, i8 -18) - %v133 = bitcast x86_mmx %v132 to <4 x i16> + %v131 = bitcast <4 x i16> %v130 to <1 x i64> + %v132 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v131, i8 -18) + %v133 = bitcast <1 x i64> 
%v132 to <4 x i16> %v134 = bitcast <4 x i16> %v133 to <1 x i64> %v135 = extractelement <1 x i64> %v134, i32 0 %v136 = bitcast i64 %v135 to <2 x i32> @@ -91,5 +91,5 @@ merge: } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll index c09c417c11c966..51a71dab37f6da 100644 --- a/llvm/test/CodeGen/X86/mmx-cvt.ll +++ b/llvm/test/CodeGen/X86/mmx-cvt.ll @@ -8,20 +8,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: cvt_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvtpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvt_v2f64_v2i32: @@ -33,9 +23,9 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind { %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0) %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ -44,20 +34,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: cvtt_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvtt_v2f64_v2i32: @@ -69,9 +49,9 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind { %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0) %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ -80,20 +60,10 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: fptosi_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-NEXT: cvttpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v2f64_v2i32: @@ -103,9 +73,9 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: retq %3 = fptosi <2 x double> %0 to <2 x i32> - %4 = bitcast <2 x i32> %3 to x86_mmx - %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4) - %6 = bitcast x86_mmx %5 to i64 + %4 = bitcast <2 x i32> %3 to <1 x i64> + %5 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %4, <1 x i64> %4) + %6 = bitcast <1 x i64> %5 to i64 %7 = insertelement <1 x i64> undef, i64 %6, i32 0 store <1 x i64> %7, ptr %1 ret void @@ -114,20 +84,10 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: cvt_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvtps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvt_v2f32_v2i32: @@ -139,9 +99,9 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind { %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0) %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ -150,20 +110,10 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind { define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: cvtt_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvtt_v2f32_v2i32: @@ -175,9 +125,9 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind { %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0) %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ 
-186,20 +136,10 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind { define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind { ; X86-LABEL: fptosi_v4f32_v4i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v4f32_v4i32: @@ -210,9 +150,9 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind { ; X64-NEXT: retq %3 = fptosi <4 x float> %0 to <4 x i32> %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> - %5 = bitcast <2 x i32> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - %7 = bitcast x86_mmx %6 to i64 + %5 = bitcast <2 x i32> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + %7 = bitcast <1 x i64> %6 to i64 %8 = insertelement <1 x i64> undef, i64 %7, i32 0 store <1 x i64> %8, ptr %1 ret void @@ -221,20 +161,10 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind { define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: fptosi_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v2f32_v2i32: @@ -246,9 +176,9 @@ define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind { %3 = fptosi <4 x float> %0 to <4 x i32> %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ -280,9 +210,9 @@ define <2 x double> @sitofp_v2i32_v2f64(ptr) nounwind { ; X64-NEXT: movq2dq %mm0, %xmm0 ; X64-NEXT: cvtdq2pd %xmm0, %xmm0 ; X64-NEXT: retq - %2 = load x86_mmx, ptr %0, align 8 - %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2) - %4 = bitcast x86_mmx %3 to i64 + %2 = load <1 x i64>, ptr %0, align 8 + %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2) + %4 = bitcast <1 x i64> %3 to i64 %5 = insertelement <2 x i64> undef, i64 %4, i32 0 %6 = bitcast <2 x i64> %5 to <4 x i32> %7 = shufflevector <4 x i32> %6, <4 x i32> undef, <2 x i32> @@ -307,9 +237,9 @@ define <4 x float> @sitofp_v2i32_v2f32(ptr) nounwind { ; X64-NEXT: movq2dq %mm0, %xmm0 ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq - %2 = load x86_mmx, ptr %0, align 8 - %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2) - %4 = bitcast x86_mmx %3 to <2 x i32> + %2 = load <1 x i64>, ptr %0, 
align 8 + %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2) + %4 = bitcast <1 x i64> %3 to <2 x i32> %5 = shufflevector <2 x i32> %4, <2 x i32> zeroinitializer, <4 x i32> %6 = sitofp <4 x i32> %5 to <4 x float> ret <4 x float> %6 @@ -339,9 +269,9 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind { ; X64-NEXT: movq2dq %mm0, %xmm0 ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq - %2 = load x86_mmx, ptr %0, align 8 - %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2) - %4 = bitcast x86_mmx %3 to i64 + %2 = load <1 x i64>, ptr %0, align 8 + %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2) + %4 = bitcast <1 x i64> %3 to i64 %5 = insertelement <2 x i64> undef, i64 %4, i32 0 %6 = insertelement <2 x i64> %5, i64 0, i32 1 %7 = bitcast <2 x i64> %6 to <4 x i32> @@ -349,7 +279,7 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind { ret <4 x float> %8 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll index 73df6be8d79890..6fe3bc4973185f 100644 --- a/llvm/test/CodeGen/X86/mmx-fold-load.ll +++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll @@ -29,13 +29,13 @@ define i64 @t0(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) define i64 @t1(ptr %a, ptr %b) nounwind { ; X86-LABEL: t1: @@ -64,13 +64,13 @@ define i64 @t1(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) define i64 @t2(ptr %a, ptr %b) nounwind { ; X86-LABEL: t2: @@ -99,13 +99,13 @@ define i64 @t2(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) define i64 @t3(ptr %a, ptr %b) nounwind { ; X86-LABEL: t3: @@ -134,13 +134,13 @@ define i64 @t3(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %0, i32 
%1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) define i64 @t4(ptr %a, ptr %b) nounwind { ; X86-LABEL: t4: @@ -169,13 +169,13 @@ define i64 @t4(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) define i64 @t5(ptr %a, ptr %b) nounwind { ; X86-LABEL: t5: @@ -204,13 +204,13 @@ define i64 @t5(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) define i64 @t6(ptr %a, ptr %b) nounwind { ; X86-LABEL: t6: @@ -239,13 +239,13 @@ define i64 @t6(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) define i64 @t7(ptr %a, ptr %b) nounwind { ; X86-LABEL: t7: @@ -274,22 +274,27 @@ define i64 @t7(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) -define i64 @tt0(x86_mmx %t, ptr %q) nounwind { +define i64 @tt0(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt0: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddb (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -301,28 +306,34 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt0: ; X64: # %bb.0: # %entry -; X64-NEXT: paddb (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddb (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; 
X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() -define i64 @tt1(x86_mmx %t, ptr %q) nounwind { +define i64 @tt1(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt1: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -334,27 +345,33 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt1: ; X64: # %bb.0: # %entry -; X64-NEXT: paddw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) -define i64 @tt2(x86_mmx %t, ptr %q) nounwind { +define i64 @tt2(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt2: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddd (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -366,27 +383,33 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt2: ; X64: # %bb.0: # %entry -; X64-NEXT: paddd (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddd (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) -define i64 @tt3(x86_mmx %t, ptr %q) nounwind { +define i64 @tt3(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt3: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq 
{{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddq (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -398,27 +421,33 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt3: ; X64: # %bb.0: # %entry -; X64-NEXT: paddq (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddq (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) -define i64 @tt4(x86_mmx %t, ptr %q) nounwind { +define i64 @tt4(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt4: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddusb (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -430,27 +459,33 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt4: ; X64: # %bb.0: # %entry -; X64-NEXT: paddusb (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddusb (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) -define i64 @tt5(x86_mmx %t, ptr %q) nounwind { +define i64 @tt5(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt5: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddusw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -462,27 +497,33 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt5: ; X64: # %bb.0: # %entry -; X64-NEXT: paddusw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddusw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) -define i64 @tt6(x86_mmx %t, ptr %q) nounwind { +define i64 @tt6(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: 
tt6: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrlw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -494,27 +535,33 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt6: ; X64: # %bb.0: # %entry -; X64-NEXT: psrlw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrlw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) -define i64 @tt7(x86_mmx %t, ptr %q) nounwind { +define i64 @tt7(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt7: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrld (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -526,27 +573,33 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt7: ; X64: # %bb.0: # %entry -; X64-NEXT: psrld (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrld (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) -define i64 @tt8(x86_mmx %t, ptr %q) nounwind { +define i64 @tt8(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt8: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrlq (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -558,18 +611,19 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt8: ; X64: # %bb.0: # %entry -; X64-NEXT: psrlq (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrlq (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x 
i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) define void @test_psrlq_by_volatile_shift_amount(ptr %t) nounwind { ; X86-LABEL: test_psrlq_by_volatile_shift_amount: @@ -599,8 +653,8 @@ entry: call void @llvm.lifetime.start(i64 4, ptr nonnull %0) store volatile i32 1, ptr %0, align 4 %1 = load volatile i32, ptr %0, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx bitcast (<1 x i64> to x86_mmx), i32 %1) - store x86_mmx %2, ptr %t, align 8 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> , i32 %1) + store <1 x i64> %2, ptr %t, align 8 call void @llvm.lifetime.end(i64 4, ptr nonnull %0) ret void } @@ -609,28 +663,41 @@ declare void @llvm.lifetime.start(i64, ptr nocapture) declare void @llvm.lifetime.end(i64, ptr nocapture) ; Make sure we shrink this vector load and fold it. -define x86_mmx @vec_load(ptr %x) { +define <1 x i64> @vec_load(ptr %x) { ; X86-LABEL: vec_load: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1] ; X86-NEXT: paddsb %mm0, %mm0 +; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: vec_load: ; X64: # %bb.0: ; X64-NEXT: pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1] ; X64-NEXT: paddsb %mm0, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq %z = load <4 x float>, ptr %x %y = extractelement <4 x float> %z, i32 0 %a = insertelement <2 x float> undef, float %y, i32 0 %b = insertelement <2 x float> %a, float %y, i32 1 - %c = bitcast <2 x float> %b to x86_mmx - %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c) - ret x86_mmx %d + %c = bitcast <2 x float> %b to <1 x i64> + %d = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %c, <1 x i64> %c) + ret <1 x i64> %d } -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/mmx-fold-zero.ll b/llvm/test/CodeGen/X86/mmx-fold-zero.ll index b2c94e3aaa3a65..a6e1275875dbc8 100644 --- a/llvm/test/CodeGen/X86/mmx-fold-zero.ll +++ b/llvm/test/CodeGen/X86/mmx-fold-zero.ll @@ -115,32 +115,32 @@ define double @mmx_zero(double, double, double, double) nounwind { ; X64-LARGE-NEXT: paddw %mm2, %mm0 ; X64-LARGE-NEXT: movq2dq %mm0, %xmm0 ; X64-LARGE-NEXT: retq - %5 = bitcast double %0 to x86_mmx - %6 = bitcast double %1 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %6) - %8 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %7, x86_mmx bitcast (double 0.000000e+00 to x86_mmx)) - %9 = bitcast double %2 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %8, x86_mmx %9) - %11 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %5, x86_mmx %10) - %12 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %6, x86_mmx %11) - %13 = bitcast double %3 to x86_mmx - %14 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %12, x86_mmx %13) - %15 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %14, x86_mmx %9) - %16 
= tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %15, x86_mmx %13) - %17 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %16, x86_mmx %10) - %18 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %17, x86_mmx %11) - %19 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %18, x86_mmx %8) - %20 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %19, x86_mmx %7) - %21 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %20, x86_mmx bitcast (double 0.000000e+00 to x86_mmx)) - %22 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %21, x86_mmx %12) - %23 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %22, x86_mmx %15) - %24 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %23, x86_mmx %6) - %25 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %24, x86_mmx %16) - %26 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %25, x86_mmx %17) - %27 = bitcast x86_mmx %26 to double + %5 = bitcast double %0 to <1 x i64> + %6 = bitcast double %1 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %6) + %8 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %7, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>)) + %9 = bitcast double %2 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %8, <1 x i64> %9) + %11 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %5, <1 x i64> %10) + %12 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %6, <1 x i64> %11) + %13 = bitcast double %3 to <1 x i64> + %14 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %12, <1 x i64> %13) + %15 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %14, <1 x i64> %9) + %16 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %15, <1 x i64> %13) + %17 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %16, <1 x i64> %10) + %18 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %17, <1 x i64> %11) + %19 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %18, <1 x i64> %8) + %20 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %19, <1 x i64> %7) + %21 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %20, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>)) + %22 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %21, <1 x i64> %12) + %23 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %22, <1 x i64> %15) + %24 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %23, <1 x i64> %6) + %25 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %24, <1 x i64> %16) + %26 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %25, <1 x i64> %17) + %27 = bitcast <1 x i64> %26 to double ret double %27 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/mmx-intrinsics.ll b/llvm/test/CodeGen/X86/mmx-intrinsics.ll index a43d9400cde6c8..a7b6ed416622ef 100644 --- a/llvm/test/CodeGen/X86/mmx-intrinsics.ll +++ b/llvm/test/CodeGen/X86/mmx-intrinsics.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+ssse3,-avx | FileCheck %s --check-prefixes=ALL,X64 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+avx | FileCheck %s --check-prefixes=ALL,X64 -declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 
@test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test1: @@ -32,24 +32,24 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test88: @@ -77,24 +77,24 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test88: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test87: @@ -122,24 +122,24 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test87: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %mmx_var.i, 
<1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test86: @@ -167,24 +167,24 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test86: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test85: @@ -212,24 +212,24 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test85: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test84: @@ -257,24 +257,24 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test84: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = 
bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test83: @@ -302,24 +302,24 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test83: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test82: @@ -347,24 +347,24 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test82: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test81: @@ -392,24 +392,24 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) 
nounwind readnone optsize ssp { ; ; X64-LABEL: test81: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test80: @@ -437,24 +437,24 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test80: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test79: @@ -482,24 +482,24 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test79: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = 
bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test78: @@ -527,24 +527,24 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test78: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test77: @@ -572,24 +572,24 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test77: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test76: @@ -617,24 +617,24 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test76: ; X64: # 
%bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packuswb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packuswb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test75: @@ -662,24 +662,24 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test75: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packssdw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packssdw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test74: @@ -707,24 +707,24 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test74: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packsswb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packsswb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } 
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) nounwind readnone define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test73: @@ -754,15 +754,15 @@ define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) nounwind readnone define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test72: @@ -792,9 +792,9 @@ define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -825,15 +825,15 @@ define i64 @test72_2(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test71: @@ -859,13 +859,13 @@ define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to i64 + %mmx_var.i = bitcast i64 %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } -declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) nounwind readnone define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test70: @@ -895,9 +895,9 @@ define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail 
call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -928,15 +928,15 @@ define i64 @test70_2(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) nounwind readnone define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test69: @@ -966,15 +966,15 @@ define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test68: @@ -1000,13 +1000,13 @@ define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to i64 + %mmx_var.i = bitcast i64 %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } -declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) nounwind readnone define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test67: @@ -1036,15 +1036,15 @@ define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) nounwind readnone define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test66: @@ -1074,9 +1074,9 @@ define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = 
bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -1107,15 +1107,15 @@ define i64 @test66_2(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test65: @@ -1146,17 +1146,17 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test64: @@ -1187,17 +1187,17 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test63: @@ -1224,15 +1224,15 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx + %mmx_var.i = bitcast i64 %0 to <1 x i64> %1 = 
extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test62: @@ -1263,17 +1263,17 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test61: @@ -1304,17 +1304,17 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test60: @@ -1341,15 +1341,15 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx + %mmx_var.i = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone 
optsize ssp { ; X86-LABEL: test59: @@ -1380,17 +1380,17 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test58: @@ -1421,17 +1421,17 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test56: @@ -1459,24 +1459,24 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test56: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pxor %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test55: @@ -1504,24 +1504,24 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test55: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 
; X64-NEXT: por %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test54: @@ -1549,24 +1549,24 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test54: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pandn %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pandn %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test53: @@ -1594,24 +1594,24 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test53: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pand %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test52: @@ -1639,18 +1639,18 @@ define i64 @test52(<1 x 
i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test52: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmullw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -1682,24 +1682,24 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test51: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmullw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test50: @@ -1727,24 +1727,24 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test50: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test49: @@ -1772,24 +1772,24 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; 
X64-LABEL: test49: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaddwd %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test48: @@ -1817,24 +1817,24 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test48: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubusw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubusw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test47: @@ -1862,24 +1862,24 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test47: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubusb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubusb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx 
@llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test46: @@ -1907,24 +1907,24 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test46: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test45: @@ -1952,18 +1952,18 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test45: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubsb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubsb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -1994,17 +1994,17 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test43(<1 x i64> 
%a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test43: @@ -2032,24 +2032,24 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test43: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test42: @@ -2077,24 +2077,24 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test42: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test41: @@ -2122,24 +2122,24 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test41: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> 
%mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test40: @@ -2167,24 +2167,24 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test40: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddusw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test39: @@ -2212,24 +2212,24 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test39: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddusb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test38: @@ -2257,24 +2257,24 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test38: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 
= bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test37: @@ -2302,24 +2302,24 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test37: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddsb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test36: @@ -2346,15 +2346,15 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test35: @@ -2382,24 +2382,24 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test35: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddd %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> 
%mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test34: @@ -2427,24 +2427,24 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test34: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test33: @@ -2472,24 +2472,24 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test33: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test32: @@ -2517,22 +2517,22 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test32: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: psadbw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 
to i64 + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test31: @@ -2560,24 +2560,24 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test31: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pminsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test30: @@ -2605,24 +2605,24 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test30: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pminub %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test29: @@ -2650,24 +2650,24 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test29: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaxsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx 
@llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test28: @@ -2695,24 +2695,24 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test28: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaxub %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test27: @@ -2740,24 +2740,24 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test27: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pavgw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test26: @@ -2785,24 +2785,24 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test26: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pavgb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> 
%b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind +declare void @llvm.x86.mmx.movnt.dq(ptr, <1 x i64>) nounwind define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp { ; X86-LABEL: test25: @@ -2819,12 +2819,12 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind + %mmx_var.i = bitcast i64 %0 to <1 x i64> + tail call void @llvm.x86.mmx.movnt.dq(ptr %p, <1 x i64> %mmx_var.i) nounwind ret void } -declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone +declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>) nounwind readnone define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test24: @@ -2850,12 +2850,12 @@ define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx - %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind + %mmx_var.i = bitcast <8 x i8> %0 to <1 x i64> + %1 = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) nounwind ret i32 %1 } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp { ; X86-LABEL: test23: @@ -2884,21 +2884,21 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp { ; ; X64-LABEL: test23: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: maskmovq %mm1, %mm0 +; X64-NEXT: maskmovq %mm0, %mm1 ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %n to <8 x i8> %1 = bitcast <1 x i64> %d to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + tail call void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) nounwind ret void } -declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test22: @@ -2926,24 +2926,24 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test22: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhuw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: 
%0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test21: @@ -2972,9 +2972,9 @@ define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -3005,15 +3005,15 @@ define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <2 x i32> %5 = extractelement <2 x i32> %4, i32 0 ret i32 %5 } -declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test20: @@ -3041,22 +3041,22 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test20: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmuludq %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test19: @@ -3081,12 +3081,12 @@ define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = 
bitcast <1 x i64> %a to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone + %1 = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %1) nounwind readnone ret <2 x double> %2 } -declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp { ; X86-LABEL: test18: @@ -3109,14 +3109,14 @@ define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone - %1 = bitcast x86_mmx %0 to <2 x i32> + %0 = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone + %1 = bitcast <1 x i64> %0 to <2 x i32> %2 = bitcast <2 x i32> %1 to <1 x i64> %3 = extractelement <1 x i64> %2, i32 0 ret i64 %3 } -declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp { ; X86-LABEL: test17: @@ -3139,14 +3139,14 @@ define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone - %1 = bitcast x86_mmx %0 to <2 x i32> + %0 = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone + %1 = bitcast <1 x i64> %0 to <2 x i32> %2 = bitcast <2 x i32> %1 to <1 x i64> %3 = extractelement <1 x i64> %2, i32 0 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test16: @@ -3173,15 +3173,15 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %mmx_var, <1 x i64> %mmx_var1, i8 16) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test15: @@ -3210,15 +3210,15 @@ define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <2 x i32> + %1 = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone +declare <1 x i64> 
@llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test14: @@ -3247,15 +3247,15 @@ define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test13: @@ -3284,15 +3284,15 @@ define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <8 x i8> - %1 = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <8 x i8> + %1 = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test12: @@ -3320,24 +3320,24 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test12: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test11: @@ -3365,24 +3365,24 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test11: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = 
bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test10: @@ -3410,24 +3410,24 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test10: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test9: @@ -3455,24 +3455,24 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test9: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pshufb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pshufb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test8: @@ -3500,24 +3500,24 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test8: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhrsw %mm0, %mm1 ; 
X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test7: @@ -3545,24 +3545,24 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test7: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pmaddubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pmaddubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test6: @@ -3590,24 +3590,24 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test6: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test5: @@ -3635,24 +3635,24 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone 
optsize ssp { ; ; X64-LABEL: test5: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test4: @@ -3680,24 +3680,24 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test4: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test3: @@ -3725,24 +3725,24 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test3: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> 
@llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test2: @@ -3770,33 +3770,49 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test2: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind { -; ALL-LABEL: test89: -; ALL: # %bb.0: -; ALL-NEXT: cvtpi2ps %mm0, %xmm0 -; ALL-NEXT: ret{{[l|q]}} - %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b) +define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind { +; X86-LABEL: test89: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: cvtpi2ps (%esp), %xmm0 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test89: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: cvtpi2ps %mm0, %xmm0 +; X64-NEXT: retq + %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, <1 x i64> %b) ret <4 x float> %c } -declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone define void @test90() { ; ALL-LABEL: test90: @@ -3836,13 +3852,11 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = bitcast <1 x i64> %a.coerce to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2) - %2 = bitcast x86_mmx %1 to <1 x i64> - ret <1 x i64> %2 + %1 = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> %a.coerce, i32 %d, i32 2) + ret <1 x i64> %1 } -declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg) +declare <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64>, i32, i32 immarg) define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind { ; X86-LABEL: test_mm_extract_pi16: @@ -3867,9 +3881,8 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind { ; X64-NEXT: pextrw $2, %mm0, %eax ; X64-NEXT: retq entry: - %0 = bitcast <1 x i64> %a.coerce to x86_mmx - %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2) + %1 = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> %a.coerce, i32 2) ret i32 %1 } -declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg) +declare i32 @llvm.x86.mmx.pextr.w(<1 x i64>, i32 immarg) diff --git a/llvm/test/CodeGen/X86/mmx-only.ll b/llvm/test/CodeGen/X86/mmx-only.ll index 
eab67e08b95743..8a87350a794294 100644 --- a/llvm/test/CodeGen/X86/mmx-only.ll +++ b/llvm/test/CodeGen/X86/mmx-only.ll @@ -3,7 +3,7 @@ ; Test that turning off sse doesn't turn off mmx. -declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone { ; CHECK-LABEL: @test88 @@ -11,10 +11,10 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 diff --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll index fd8bd1facaf6b2..6bb564c4b757e6 100644 --- a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll +++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll @@ -1,18 +1,18 @@ ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+mmx,+fma,+f16c,+avx512f -stop-after finalize-isel -o - %s | FileCheck %s ; This test ensures that the MXCSR is implicitly used by MMX FP instructions. -define x86_mmx @mxcsr_mmx(<4 x float> %a0) { +define <1 x i64> @mxcsr_mmx(<4 x float> %a0) { ; CHECK: MMX_CVTPS2PIrr %{{[0-9]}}, implicit $mxcsr ; CHECK: MMX_CVTPI2PSrr %{{[0-9]}}, killed %{{[0-9]}}, implicit $mxcsr ; CHECK: MMX_CVTTPS2PIrr killed %{{[0-9]}}, implicit $mxcsr ; CHECK: MMX_CVTPI2PDrr killed %{{[0-9]$}} ; CHECK: MMX_CVTPD2PIrr killed %{{[0-9]}}, implicit $mxcsr - %1 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) - %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %1) - %3 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %2) - %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %3) - %5 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %4) - ret x86_mmx %5 + %1 = call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %a0) + %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %1) + %3 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %2) + %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %3) + %5 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %4) + ret <1 x i64> %5 } define half @mxcsr_f16c(float %a) { @@ -41,11 +41,11 @@ define <8 x double> @mxcsr_fma_sae(<8 x double> %a, <8 x double> %b, <8 x double ret <8 x double> %res } -declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) -declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) -declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) -declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) +declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) +declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) +declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>) +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) +declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, 
<4 x float>, <4 x float>) declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll index 1f273eb43c6a60..3b6ffacb0b230e 100644 --- a/llvm/test/CodeGen/X86/nontemporal.ll +++ b/llvm/test/CodeGen/X86/nontemporal.ll @@ -193,11 +193,11 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) { ; X64-NEXT: movntq %mm0, (%rsi) ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a0 - %1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3) - store x86_mmx %1, ptr %a1, align 8, !nontemporal !0 + %0 = load <1 x i64>, ptr %a0 + %1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3) + store <1 x i64> %1, ptr %a1, align 8, !nontemporal !0 ret void } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone !0 = !{i32 1} diff --git a/llvm/test/CodeGen/X86/pr13859.ll b/llvm/test/CodeGen/X86/pr13859.ll index 9b290e6947d58d..35466478f289b1 100644 --- a/llvm/test/CodeGen/X86/pr13859.ll +++ b/llvm/test/CodeGen/X86/pr13859.ll @@ -13,8 +13,7 @@ entry: %a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1 %a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2 %a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3 - %a40 = bitcast <4 x i16> %a39 to x86_mmx - %a41 = bitcast x86_mmx %a40 to <1 x i64> + %a40 = bitcast <4 x i16> %a39 to <1 x i64> %a47 = trunc i32 %a32 to i1 br i1 %a47, label %a48, label %a49 @@ -23,6 +22,6 @@ a48: unreachable a49: - store <1 x i64> %a41, ptr %dest, align 8 ; !!! + store <1 x i64> %a40, ptr %dest, align 8 ; !!! ret void } diff --git a/llvm/test/CodeGen/X86/pr23246.ll b/llvm/test/CodeGen/X86/pr23246.ll index 45587b8c69cd40..da3246a917ea3d 100644 --- a/llvm/test/CodeGen/X86/pr23246.ll +++ b/llvm/test/CodeGen/X86/pr23246.ll @@ -6,15 +6,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; PR23246 ; We're really only interested in doing something sane with the shuffle. 
-define <2 x i64> @test(x86_mmx %a) #0 { +define <2 x i64> @test(<1 x i64> %a) #0 { ; CHECK-LABEL: test: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %rdi, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; CHECK-NEXT: retq entry: - %b = bitcast x86_mmx %a to <1 x i64> - %s = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> + %s = shufflevector <1 x i64> %a, <1 x i64> undef, <2 x i32> ret <2 x i64> %s } diff --git a/llvm/test/CodeGen/X86/pr29222.ll b/llvm/test/CodeGen/X86/pr29222.ll index 9a38515b65594c..9814361404f2d4 100644 --- a/llvm/test/CodeGen/X86/pr29222.ll +++ b/llvm/test/CodeGen/X86/pr29222.ll @@ -32,7 +32,7 @@ define i32 @PR29222(i32) nounwind { ; X86-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] ; X86-AVX-NEXT: packsswb %mm0, %mm0 ; X86-AVX-NEXT: movq %mm0, (%esp) -; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vpbroadcastq (%esp), %xmm0 ; X86-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd %xmm0, %eax ; X86-AVX-NEXT: movl %ebp, %esp @@ -60,9 +60,9 @@ define i32 @PR29222(i32) nounwind { ; X64-AVX-NEXT: retq %2 = insertelement <2 x i32> undef, i32 %0, i32 0 %3 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer - %4 = bitcast <2 x i32> %3 to x86_mmx - %5 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %4, x86_mmx %4) - %6 = bitcast x86_mmx %5 to i64 + %4 = bitcast <2 x i32> %3 to <1 x i64> + %5 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %4, <1 x i64> %4) + %6 = bitcast <1 x i64> %5 to i64 %7 = insertelement <2 x i64> undef, i64 %6, i32 0 %8 = bitcast <2 x i64> %7 to <8 x i16> %9 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %8, <8 x i16> undef) @@ -71,5 +71,5 @@ define i32 @PR29222(i32) nounwind { ret i32 %11 } -declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll index b6022698edaeb9..0ad35309b87bb4 100644 --- a/llvm/test/CodeGen/X86/pr35982.ll +++ b/llvm/test/CodeGen/X86/pr35982.ll @@ -35,9 +35,9 @@ define float @PR35982_emms(<1 x i64>) nounwind { %2 = bitcast <1 x i64> %0 to <2 x i32> %3 = extractelement <2 x i32> %2, i32 0 %4 = extractelement <1 x i64> %0, i32 0 - %5 = bitcast i64 %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %5, x86_mmx %5) - %7 = bitcast x86_mmx %6 to <2 x i32> + %5 = bitcast i64 %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %5, <1 x i64> %5) + %7 = bitcast <1 x i64> %6 to <2 x i32> %8 = extractelement <2 x i32> %7, i32 0 tail call void @llvm.x86.mmx.emms() %9 = sitofp i32 %3 to float @@ -46,5 +46,5 @@ define float @PR35982_emms(<1 x i64>) nounwind { ret float %11 } -declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/select-mmx.ll b/llvm/test/CodeGen/X86/select-mmx.ll index 27b7ebb8381cd3..8a4308a5af64b2 100644 --- a/llvm/test/CodeGen/X86/select-mmx.ll +++ b/llvm/test/CodeGen/X86/select-mmx.ll @@ -14,15 +14,11 @@ define i64 @test47(i64 %arg) { ; ; X64-LABEL: test47: ; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB0_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: pxor %mm0, %mm0 -; X64-NEXT: jmp .LBB0_3 -; X64-NEXT: .LBB0_1: -; X64-NEXT: movl $7, %eax -; X64-NEXT: movd %eax, 
%mm0 -; X64-NEXT: .LBB0_3: +; X64-NEXT: movl $7, %ecx +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %mm0 ; X64-NEXT: psllw %mm0, %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq @@ -35,17 +31,17 @@ define i64 @test47(i64 %arg) { ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: orl 12(%ebp), %eax -; X86-NEXT: je .LBB0_1 -; X86-NEXT: # %bb.2: -; X86-NEXT: pxor %mm0, %mm0 -; X86-NEXT: jmp .LBB0_3 -; X86-NEXT: .LBB0_1: ; X86-NEXT: movl $7, %eax -; X86-NEXT: movd %eax, %mm0 -; X86-NEXT: .LBB0_3: +; X86-NEXT: je .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB0_2: +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psllw %mm0, %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -55,9 +51,9 @@ define i64 @test47(i64 %arg) { ; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl %cond = icmp eq i64 %arg, 0 - %slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx) - %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct) - %retc = bitcast x86_mmx %psll to i64 + %slct = select i1 %cond, <1 x i64> bitcast (i64 7 to <1 x i64>), <1 x i64> bitcast (i64 0 to <1 x i64>) + %psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct) + %retc = bitcast <1 x i64> %psll to i64 ret i64 %retc } @@ -74,13 +70,8 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) { ; X64-LABEL: test49: ; X64: # %bb.0: ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB1_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: movq %rdx, %mm0 -; X64-NEXT: jmp .LBB1_3 -; X64-NEXT: .LBB1_1: +; X64-NEXT: cmovneq %rdx, %rsi ; X64-NEXT: movq %rsi, %mm0 -; X64-NEXT: .LBB1_3: ; X64-NEXT: psllw %mm0, %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq @@ -113,13 +104,13 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) { ; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl %cond = icmp eq i64 %arg, 0 - %xmmx = bitcast i64 %x to x86_mmx - %ymmx = bitcast i64 %y to x86_mmx - %slct = select i1 %cond, x86_mmx %xmmx, x86_mmx %ymmx - %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct) - %retc = bitcast x86_mmx %psll to i64 + %xmmx = bitcast i64 %x to <1 x i64> + %ymmx = bitcast i64 %y to <1 x i64> + %slct = select i1 %cond, <1 x i64> %xmmx, <1 x i64> %ymmx + %psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct) + %retc = bitcast <1 x i64> %psll to i64 ret i64 %retc } -declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/stack-folding-mmx.ll b/llvm/test/CodeGen/X86/stack-folding-mmx.ll index 11ca9e2a547eef..6eb99dd6c67582 100644 --- a/llvm/test/CodeGen/X86/stack-folding-mmx.ll +++ b/llvm/test/CodeGen/X86/stack-folding-mmx.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s -define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) { +define <1 x i64> @stack_fold_cvtpd2pi(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtpd2pi: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9,45 +9,47 @@ define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtpd2pi 
{{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone - ret x86_mmx %2 + %2 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone -define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) { +define <2 x double> @stack_fold_cvtpi2pd(<1 x i64> %a0) { ; CHECK-LABEL: stack_fold_cvtpi2pd: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtpi2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %a0) nounwind readnone + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %a0) nounwind readnone ret <2 x double> %2 } -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone -define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) { +define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) { ; CHECK-LABEL: stack_fold_cvtpi2ps: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtpi2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %a1) nounwind readnone + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) nounwind readnone ret <4 x float> %2 } -declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) { +define <1 x i64> @stack_fold_cvtps2pi(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtps2pi: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -55,15 +57,15 @@ define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone - ret x86_mmx %2 + %2 = call <1 x i64> 
@llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone -define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) { +define <1 x i64> @stack_fold_cvttpd2pi(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvttpd2pi: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -71,15 +73,15 @@ define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvttpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone - ret x86_mmx %2 + %2 = call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone -define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) { +define <1 x i64> @stack_fold_cvttps2pi(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvttps2pi: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -87,18 +89,18 @@ define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvttps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone - ret x86_mmx %2 + %2 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone ; TODO stack_fold_movd_load ; padd forces execution on mmx -define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind { +define i32 @stack_fold_movd_store(<1 x i64> %a0) nounwind { ; CHECK-LABEL: stack_fold_movd_store: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp @@ -107,6 +109,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: paddb %mm0, %mm0 ; CHECK-NEXT: movd %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP @@ -120,8 +123,8 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq - %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0) - %2 = bitcast x86_mmx %1 to <2 x i32> + %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0) + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = extractelement <2 x i32> %2, i32 0 %4 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() ret i32 %3 @@ -130,7 +133,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind 
{ ; TODO stack_fold_movq_load ; padd forces execution on mmx -define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind { +define i64 @stack_fold_movq_store(<1 x i64> %a0) nounwind { ; CHECK-LABEL: stack_fold_movq_store: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp @@ -139,6 +142,7 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: paddb %mm0, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP @@ -152,1171 +156,1235 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq - %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0) - %2 = bitcast x86_mmx %1 to i64 + %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0) + %2 = bitcast <1 x i64> %1 to i64 %3 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() ret i64 %2 } -define x86_mmx @stack_fold_pabsb(x86_mmx %a0) { +define <1 x i64> @stack_fold_pabsb(<1 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsb: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %a0) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pabsd(x86_mmx %a0) { +define <1 x i64> @stack_fold_pabsd(<1 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsd: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %a0) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pabsw(x86_mmx %a0) { +define <1 x i64> @stack_fold_pabsw(<1 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsw: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: 
retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %a0) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone -define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_packssdw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_packssdw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packssdw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_packsswb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_packsswb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packsswb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packsswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_packuswb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_packuswb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packuswb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) 
nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %a, <1 x i64> %b) 
nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddsb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddsb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddsb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddusb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddusb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddusb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) 
nounwind readnone -define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddusw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddusw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddusw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_palignr(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_palignr(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_palignr: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: palignr $1, %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %a, x86_mmx %b, i8 1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %a, <1 x i64> %b, i8 1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone -define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pand(<1 x i64> %a, <1 x i64> %b) { ; 
CHECK-LABEL: stack_fold_pand: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pand %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pand {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pandn(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pandn: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pandn %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pavgb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pavgb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pavgb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pavgb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pavgw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pavgw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pavgw %mm0, %mm1 +; CHECK-NEXT: 
movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pavgw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpeqb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpeqb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpeqd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpeqd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpeqw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpeqw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: 
movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpgtb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpgtb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpgtd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpgtd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpgtw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpgtw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx 
@llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phaddd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phaddd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phaddd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phaddsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phaddsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phaddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phaddw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phaddw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phaddw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", 
"=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phsubd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phsubd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phsubd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phsubsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phsubsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phsubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phsubw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phsubw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phsubw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x 
i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone ; TODO stack_fold_pinsrw -define x86_mmx @stack_fold_pmaddubsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmaddubsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmaddubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaddubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmaddwd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmaddwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaddwd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmaxsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmaxsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaxsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> 
@llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmaxub(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmaxub: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaxub %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pminsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pminsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pminsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pminsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pminub(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pminub: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pminub %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pminub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmulhrsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmulhrsw(<1 x i64> %a, <1 x 
i64> %b) { ; CHECK-LABEL: stack_fold_pmulhrsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhrsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmulhuw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmulhuw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhuw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmulhw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmulhw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmullw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmullw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq 
%rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmullw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmullw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmuludq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmuludq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmuludq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_por(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_por: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: por %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psadbw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psadbw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psadbw 
{{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pshufb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pshufb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pshufb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload -; CHECK-NEXT: pshufb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pshufw(x86_mmx %a) { +define <1 x i64> @stack_fold_pshufw(<1 x i64> %a) { ; CHECK-LABEL: stack_fold_pshufw: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pshufw $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload ; CHECK-NEXT: # mm0 = mem[1,0,0,0] -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %a, i8 1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone -define x86_mmx @stack_fold_psignb(x86_mmx %a0, x86_mmx %a1) { +define <1 x i64> @stack_fold_psignb(<1 x i64> %a0, <1 x i64> %a1) { ; CHECK-LABEL: stack_fold_psignb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; 
CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %a0, x86_mmx %a1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psignd(x86_mmx %a0, x86_mmx %a1) { +define <1 x i64> @stack_fold_psignd(<1 x i64> %a0, <1 x i64> %a1) { ; CHECK-LABEL: stack_fold_psignd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %a0, x86_mmx %a1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psignw(x86_mmx %a0, x86_mmx %a1) { +define <1 x i64> @stack_fold_psignw(<1 x i64> %a0, <1 x i64> %a1) { ; CHECK-LABEL: stack_fold_psignw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %a0, x86_mmx %a1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pslld(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pslld %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pslld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx 
@llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psllq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psllq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psllq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psllw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psllw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psllw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psrad(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrad %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrad {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> 
@llvm.x86.mmx.psra.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psraw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psraw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psraw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psrld(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrld %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psrlq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrlq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrlq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, 
<1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psrlw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrlw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrlw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubq: ; CHECK: # %bb.0: -; 
CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubsb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubsb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubsb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubusb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubusb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubusb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, 
%rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubusw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubusw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubusw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpckhbw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpckhbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhbw 
{{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpckhdq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpckhdq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[1],mem[1] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpckhwd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpckhwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[2],mem[2],mm0[3],mem[3] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpcklbw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpcklbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpcklbw %mm0, %mm1 # mm1 
= mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpckldq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpckldq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpcklwd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpcklwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0],mm0[1],mem[1] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pxor(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pxor: ; CHECK: # %bb.0: -; 
CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pxor %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pxor {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll index 672b4591316ce8..cd375c04168818 100644 --- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll @@ -20,9 +20,9 @@ define i32 @test0(ptr %v4) nounwind { entry: %v5 = load <1 x i64>, ptr %v4, align 8 %v12 = bitcast <1 x i64> %v5 to <4 x i16> - %v13 = bitcast <4 x i16> %v12 to x86_mmx - %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18) - %v15 = bitcast x86_mmx %v14 to <4 x i16> + %v13 = bitcast <4 x i16> %v12 to <1 x i64> + %v14 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v13, i8 -18) + %v15 = bitcast <1 x i64> %v14 to <4 x i16> %v16 = bitcast <4 x i16> %v15 to <1 x i64> %v17 = extractelement <1 x i64> %v16, i32 0 %v18 = bitcast i64 %v17 to <2 x i32> @@ -52,12 +52,12 @@ entry: %0 = load i32, ptr %ptr, align 4 %1 = insertelement <2 x i32> undef, i32 %0, i32 0 %2 = insertelement <2 x i32> %1, i32 0, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = bitcast x86_mmx %3 to i64 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = bitcast <1 x i64> %3 to i64 %5 = bitcast i64 %4 to <4 x i16> - %6 = bitcast <4 x i16> %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %6, i8 -24) - %8 = bitcast x86_mmx %7 to <4 x i16> + %6 = bitcast <4 x i16> %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %6, i8 -24) + %8 = bitcast <1 x i64> %7 to <4 x i16> %9 = bitcast <4 x i16> %8 to <1 x i64> %10 = extractelement <1 x i64> %9, i32 0 %11 = bitcast i64 %10 to <2 x i32> @@ -82,9 +82,9 @@ define i32 @test2(ptr nocapture readonly %ptr) nounwind { ; X64-NEXT: emms ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %ptr, align 8 - %1 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %0, i8 -24) - %2 = bitcast x86_mmx %1 to <4 x i16> + %0 = load <1 x i64>, ptr %ptr, align 8 + %1 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %0, i8 -24) + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 %5 = bitcast i64 %4 to <2 x i32> @@ -93,40 +93,39 @@ entry: ret i32 %6 } -define i32 @test3(x86_mmx %a) nounwind { +define i32 @test3(<1 x i64> %a) nounwind { ; X86-LABEL: test3: ; X86: # %bb.0: -; X86-NEXT: movd %mm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: test3: ; X64: # %bb.0: -; X64-NEXT: movd %mm0, %eax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq - %tmp0 = bitcast x86_mmx %a to <2 x i32> + %tmp0 = bitcast <1 x i64> %a to <2 x i32> %tmp1 = extractelement <2 
x i32> %tmp0, i32 0 ret i32 %tmp1 } ; Verify we don't muck with extractelts from the upper lane. -define i32 @test4(x86_mmx %a) nounwind { +define i32 @test4(<1 x i64> %a) nounwind { ; X86-LABEL: test4: ; X86: # %bb.0: -; X86-NEXT: movq2dq %mm0, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: test4: ; X64: # %bb.0: -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: retq - %tmp0 = bitcast x86_mmx %a to <2 x i32> + %tmp0 = bitcast <1 x i64> %a to <2 x i32> %tmp1 = extractelement <2 x i32> %tmp0, i32 1 ret i32 %tmp1 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll index aa8a394f33a8bb..91743898545ee1 100644 --- a/llvm/test/CodeGen/X86/vec_insert-5.ll +++ b/llvm/test/CodeGen/X86/vec_insert-5.ll @@ -25,8 +25,8 @@ define void @t1(i32 %a, ptr %P) nounwind { %tmp12 = shl i32 %a, 12 %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1 %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0 - %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx - store x86_mmx %tmp23, ptr %P + %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64> + store <1 x i64> %tmp23, ptr %P ret void } diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll index cea047453de43e..67473febf28c77 100644 --- a/llvm/test/CodeGen/X86/vec_insert-7.ll +++ b/llvm/test/CodeGen/X86/vec_insert-7.ll @@ -5,21 +5,20 @@ ; MMX insertelement is not available; these are promoted to xmm. ; (Without SSE they are split to two ints, and the code is much better.) -define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { +define <1 x i64> @mmx_movzl(<1 x i64> %x) nounwind { ; X86-LABEL: mmx_movzl: ; X86: ## %bb.0: ; X86-NEXT: movl $32, %eax -; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; ; X64-LABEL: mmx_movzl: ; X64: ## %bb.0: ; X64-NEXT: movl $32, %eax -; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: retq - %tmp = bitcast x86_mmx %x to <2 x i32> + %tmp = bitcast <1 x i64> %x to <2 x i32> %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 - %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx - ret x86_mmx %tmp9 + %tmp9 = bitcast <2 x i32> %tmp8 to <1 x i64> + ret <1 x i64> %tmp9 } diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll index f561a2a20e194f..f95b34685211d7 100644 --- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll @@ -3,22 +3,22 @@ ; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64 ; This is not an MMX operation; promoted to xmm. 
-define x86_mmx @t0(i32 %A) nounwind { +define <1 x i64> @t0(i32 %A) nounwind { ; X86-LABEL: t0: ; X86: ## %bb.0: -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 -; X86-NEXT: pxor %mm0, %mm0 -; X86-NEXT: punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0] +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: t0: ; X64: ## %bb.0: ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X64-NEXT: psllq $32, %xmm0 +; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: retq %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 - %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx - ret x86_mmx %tmp4 + %tmp4 = bitcast <2 x i32> %tmp3 to <1 x i64> + ret <1 x i64> %tmp4 } define <8 x i8> @t1(i8 zeroext %x) nounwind { diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll index 709be6534d777d..60800673ed2dd4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll @@ -52,9 +52,9 @@ entry: %tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16> %tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 > %tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8> - %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx - %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx - tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, ptr null) + %tmp556 = bitcast <8 x i8> %tmp555 to <1 x i64> + %tmp557 = bitcast <8 x i8> zeroinitializer to <1 x i64> + tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp557, <1 x i64> %tmp556, ptr null) ret void } @@ -115,19 +115,19 @@ define <4 x float> @pr35869() nounwind { ; X64-NEXT: punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1] ; X64-NEXT: cvtpi2ps %mm0, %xmm0 ; X64-NEXT: retq - %1 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx bitcast (<8 x i8> to x86_mmx), x86_mmx bitcast (<8 x i8> zeroinitializer to x86_mmx)) - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx bitcast (<4 x i16> zeroinitializer to x86_mmx), x86_mmx %1) - %3 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %1, x86_mmx %2) - %4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, x86_mmx %3) + %1 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> bitcast (<8 x i8> to <1 x i64>), <1 x i64> bitcast (<8 x i8> zeroinitializer to <1 x i64>)) + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> bitcast (<4 x i16> zeroinitializer to <1 x i64>), <1 x i64> %1) + %3 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %1, <1 x i64> %2) + %4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, <1 x i64> %3) %5 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> - %6 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %1, x86_mmx %2) - %7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, x86_mmx %6) + %6 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %1, <1 x i64> %2) + %7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, <1 x i64> %6) ret <4 x float> %7 } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) -declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) -declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) +declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x 
i64>) +declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/x86-64-psub.ll b/llvm/test/CodeGen/X86/x86-64-psub.ll index 9817d798fd4bf8..4c11464075ec92 100644 --- a/llvm/test/CodeGen/X86/x86-64-psub.ll +++ b/llvm/test/CodeGen/X86/x86-64-psub.ll @@ -32,11 +32,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8> - %3 = bitcast <8 x i8> %2 to x86_mmx + %3 = bitcast <8 x i8> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8> - %5 = bitcast <8 x i8> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <8 x i8> + %5 = bitcast <8 x i8> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <8 x i8> %8 = bitcast <8 x i8> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -66,11 +66,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16> - %3 = bitcast <4 x i16> %2 to x86_mmx + %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16> - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <4 x i16> + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <4 x i16> %8 = bitcast <4 x i16> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -100,11 +100,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <2 x i32> - %3 = bitcast <2 x i32> %2 to x86_mmx + %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <2 x i32> - %5 = bitcast <2 x i32> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <2 x i32> + %5 = bitcast <2 x i32> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <2 x i32> %8 = bitcast <2 x i32> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -134,11 +134,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8> - %3 = bitcast <8 x i8> %2 to x86_mmx + %3 = bitcast <8 x i8> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8> - %5 = bitcast <8 x i8> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <8 x i8> + %5 = bitcast <8 x i8> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <8 x i8> %8 = bitcast <8 x i8> %7 to <1 x 
i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -168,11 +168,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16> - %3 = bitcast <4 x i16> %2 to x86_mmx + %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16> - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <4 x i16> + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <4 x i16> %8 = bitcast <4 x i16> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -202,11 +202,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8> - %3 = bitcast <8 x i8> %2 to x86_mmx + %3 = bitcast <8 x i8> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8> - %5 = bitcast <8 x i8> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <8 x i8> + %5 = bitcast <8 x i8> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <8 x i8> %8 = bitcast <8 x i8> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -236,26 +236,26 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16> - %3 = bitcast <4 x i16> %2 to x86_mmx + %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16> - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <4 x i16> + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <4 x i16> %8 = bitcast <4 x i16> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 } -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) 
nounwind readnone diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll index 39b2b6225d8b10..1d2e38eb5e63d8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test1( @@ -17,16 +17,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5:[0-9]+]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -35,16 +35,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 { ; 
CHECK-LABEL: define i64 @test88( @@ -57,16 +57,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -75,16 +75,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test87( @@ -97,16 +97,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], 
[[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -115,16 +115,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test86( @@ -137,16 +137,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] 
= tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -155,16 +155,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test85( @@ -177,16 +177,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr 
@__msan_retval_tls, align 8 @@ -195,16 +195,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test84( @@ -217,16 +217,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -235,16 +235,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> 
%2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test83( @@ -257,16 +257,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -275,16 +275,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test82( @@ -297,16 +297,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 
x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -315,16 +315,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test81( @@ -337,16 +337,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; 
CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -355,16 +355,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test80( @@ -377,16 +377,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: 
[[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -395,16 +395,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test79( @@ -417,16 +417,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -435,16 +435,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx 
%mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test78( @@ -457,16 +457,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -475,16 +475,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 { ; 
CHECK-LABEL: define i64 @test77( @@ -497,16 +497,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -515,16 +515,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test76( @@ -537,23 +537,22 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16> -; CHECK-NEXT: 
[[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -564,16 +563,16 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test75( @@ -586,23 +585,22 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast 
<2 x i32> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <2 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <2 x i1> [[TMP12]] to <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <4 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <4 x i16> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -613,16 +611,16 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test74( @@ -635,23 +633,22 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] 
to <4 x i16> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -662,16 +659,16 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) nounwind readnone define i64 @test73(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test73( @@ -681,17 +678,15 @@ define i64 @test73(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -699,15 +694,15 @@ define i64 @test73(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) nounwind readnone define i64 @test72(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test72( @@ -717,17 +712,15 @@ define i64 @test72(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 
x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -735,9 +728,9 @@ define i64 @test72(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -751,17 +744,15 @@ define i64 @test72_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -769,15 +760,15 @@ define i64 @test72_2(<1 x i64> %a) #0 { ; 
entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone define i64 @test71(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test71( @@ -787,25 +778,25 @@ define i64 @test71(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[TMP6]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[TMP2]], i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP4]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to i64 + %mmx_var.i = bitcast i64 %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } -declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) nounwind readnone define i64 @test70(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test70( @@ -815,17 +806,15 @@ define i64 @test70(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast 
x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -833,9 +822,9 @@ define i64 @test70(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -849,17 +838,15 @@ define i64 @test70_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr 
@__msan_retval_tls, align 8 @@ -867,15 +854,15 @@ define i64 @test70_2(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) nounwind readnone define i64 @test69(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test69( @@ -885,17 +872,15 @@ define i64 @test69(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -903,15 +888,15 @@ define i64 @test69(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone define i64 @test68(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test68( @@ -921,25 +906,25 @@ define i64 @test68(<1 x 
i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[TMP6]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[TMP2]], i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP4]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to i64 + %mmx_var.i = bitcast i64 %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } -declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) nounwind readnone define i64 @test67(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test67( @@ -949,17 +934,15 @@ define i64 @test67(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -967,15 +950,15 @@ define i64 @test67(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) nounwind readnone define i64 @test66(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test66( @@ -985,17 +968,15 @@ define i64 @test66(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -1003,9 +984,9 @@ define i64 @test66(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = 
extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -1019,17 +1000,15 @@ define i64 @test66_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -1037,15 +1016,15 @@ define i64 @test66_2(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test65( @@ -1056,20 +1035,21 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: 
[[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1079,17 +1059,17 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test64( @@ -1100,20 +1080,21 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: 
[[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1123,17 +1104,17 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test63( @@ -1144,32 +1125,35 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx 
@llvm.x86.mmx.psrl.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64 ; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP5]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx + %mmx_var.i = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test62( @@ -1180,20 +1164,21 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> 
@llvm.x86.mmx.psrl.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1203,17 +1188,17 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test61( @@ -1224,20 +1209,21 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> 
@llvm.x86.mmx.psrl.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1247,17 +1233,17 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test60( @@ -1268,32 +1254,35 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64 ; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP5]] ; entry: %0 = extractelement <1 x 
i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx + %mmx_var.i = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test59( @@ -1304,20 +1293,21 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1327,17 +1317,17 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = 
bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test58( @@ -1348,20 +1338,21 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1371,17 +1362,17 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x 
i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test56( @@ -1394,16 +1385,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1412,16 +1403,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test55( @@ -1434,16 +1425,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast 
<2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1452,16 +1443,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test54( @@ -1474,16 +1465,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = 
bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1492,16 +1483,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test53( @@ -1514,16 +1505,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x 
i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1532,16 +1523,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test52( @@ -1554,16 +1545,16 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1572,10 +1563,10 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast 
x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -1592,16 +1583,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1610,16 +1601,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test50( @@ -1632,16 +1623,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; 
CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1650,16 +1641,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test49( @@ -1672,38 +1663,38 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = 
bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP12]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP18]] ; entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test48( @@ -1716,16 +1707,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = 
bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1734,16 +1725,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test47( @@ -1756,16 +1747,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> 
[[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1774,16 +1765,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test46( @@ -1796,16 +1787,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1814,16 +1805,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> 
%1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test45( @@ -1836,16 +1827,16 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1854,10 +1845,10 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -1872,29 +1863,32 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 
[[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test43( @@ -1907,16 +1901,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x 
i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1925,16 +1919,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test42( @@ -1947,16 +1941,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1965,16 +1959,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call 
x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test41( @@ -1987,16 +1981,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2005,16 +1999,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: 
define i64 @test40( @@ -2027,16 +2021,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2045,16 +2039,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test39( @@ -2067,16 +2061,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: 
[[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2085,16 +2079,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test38( @@ -2107,16 +2101,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> 
@llvm.x86.mmx.padds.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2125,16 +2119,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test37( @@ -2147,16 +2141,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2165,16 
+2159,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test36( @@ -2185,27 +2179,30 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test35( @@ -2218,16 +2215,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> 
[[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2236,16 +2233,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test34( @@ -2258,16 +2255,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] 
= bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2276,16 +2273,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test33( @@ -2298,16 +2295,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x 
i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2316,16 +2313,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test32( @@ -2338,30 +2335,33 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 48 -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64 [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP17]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP14]] to i64 +; CHECK-NEXT: store i64 [[TMP15]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = 
bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test31( @@ -2374,16 +2374,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2392,16 +2392,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test30( @@ -2414,16 +2414,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: 
[[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2432,16 +2432,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test29( @@ -2454,16 +2454,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2472,16 +2472,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test28( @@ -2494,16 +2494,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = 
bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2512,16 +2512,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test27( @@ -2534,16 +2534,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2552,16 +2552,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast 
<4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test26( @@ -2574,16 +2574,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2592,16 +2592,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind +declare void @llvm.x86.mmx.movnt.dq(ptr, <1 x i64>) nounwind define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 { ; CHECK-LABEL: define void @test25( @@ -2612,26 +2612,29 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 { ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]] ; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], x86_mmx [[MMX_VAR_I]]) #[[ATTR2]] +; CHECK: 8: +; CHECK-NEXT: tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], <1 x i64> [[MMX_VAR_I]]) #[[ATTR2]] ; CHECK-NEXT: ret void ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind + %mmx_var.i = bitcast i64 %0 to <1 x i64> + tail call void @llvm.x86.mmx.movnt.dq(ptr %p, <1 x i64> %mmx_var.i) nounwind ret void } -declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone +declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>) nounwind readnone define i32 @test24(<1 x i64> %a) #0 { ; CHECK-LABEL: define i32 @test24( @@ -2641,26 +2644,27 @@ define i32 @test24(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP6]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[MMX_VAR_I]]) #[[ATTR2]] +; CHECK: 6: +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[MMX_VAR_I]]) #[[ATTR2]] ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP1]] ; entry: %0 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx - %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind + %mmx_var.i = bitcast <8 x i8> %0 to <1 x i64> + %1 = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) nounwind ret i32 %1 } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind define void @test23(<1 x i64> %d, <1 x i64> 
%n, ptr %p) nounwind optsize ssp #0 { ; CHECK-LABEL: define void @test23( @@ -2674,33 +2678,35 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[N]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[D]] to <8 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP5]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP9]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP8]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] -; CHECK: 9: +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 11: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: tail call void @llvm.x86.mmx.maskmovq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]] +; CHECK: 12: +; CHECK-NEXT: tail call void @llvm.x86.mmx.maskmovq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]] ; CHECK-NEXT: ret void ; entry: %0 = bitcast <1 x i64> %n to <8 x i8> %1 = bitcast <1 x i64> %d to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + tail call void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) nounwind ret void } -declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test22( @@ -2713,16 +2719,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x 
i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2731,16 +2737,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone define i64 @test21(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test21( @@ -2750,16 +2756,17 @@ define i64 @test21(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK: 7: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 @@ -2767,9 +2774,9 @@ define i64 
@test21(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -2783,16 +2790,17 @@ define i32 @test21_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK: 7: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 @@ -2800,15 +2808,15 @@ define i32 @test21_2(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <2 x i32> %5 = extractelement <2 x i32> %4, i32 0 ret i32 %5 } -declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test20( @@ -2821,27 +2829,28 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP3:%.*]] = 
bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[_MSPROP:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone define <2 x double> @test19(<1 x i64> %a) #0 { ; CHECK-LABEL: define <2 x double> @test19( @@ -2851,26 +2860,27 @@ define <2 x double> @test19(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP7]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP7]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx [[TMP1]]) #[[ATTR5]] +; CHECK: 7: +; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> [[TMP8]]) #[[ATTR5]] ; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[TMP2]] ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone + %1 = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %1) nounwind readnone ret <2 x double> %2 } -declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone define i64 @test18(<2 x double> %a) #0 { ; CHECK-LABEL: define i64 @test18( @@ -2885,22 +2895,22 @@ define i64 @test18(<2 x double> %a) #0 { ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable ; CHECK: 
3: -; CHECK-NEXT: [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: - %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone - %1 = bitcast x86_mmx %0 to <2 x i32> + %0 = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone + %1 = bitcast <1 x i64> %0 to <2 x i32> %2 = bitcast <2 x i32> %1 to <1 x i64> %3 = extractelement <1 x i64> %2, i32 0 ret i64 %3 } -declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone define i64 @test17(<2 x double> %a) #0 { ; CHECK-LABEL: define i64 @test17( @@ -2915,22 +2925,22 @@ define i64 @test17(<2 x double> %a) #0 { ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable ; CHECK: 3: -; CHECK-NEXT: [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: - %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone - %1 = bitcast x86_mmx %0 to <2 x i32> + %0 = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone + %1 = bitcast <1 x i64> %0 to <2 x i32> %2 = bitcast <2 x i32> %1 to <1 x i64> %3 = extractelement <1 x i64> %2, i32 0 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test16( @@ -2941,34 +2951,38 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0 +; CHECK-NEXT: 
[[TMP12:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP12]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]], i8 16) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]], i8 16) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %mmx_var, <1 x i64> %mmx_var1, i8 16) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone define i64 @test15(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test15( @@ -2978,13 +2992,13 @@ define i64 @test15(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2992,15 +3006,15 @@ define i64 @test15(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <2 x i32> + %1 = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 
to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone define i64 @test14(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test14( @@ -3010,13 +3024,13 @@ define i64 @test14(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -3024,15 +3038,15 @@ define i64 @test14(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone define i64 @test13(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test13( @@ -3042,13 +3056,13 @@ define i64 @test13(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <8 x i8> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; 
CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -3056,15 +3070,15 @@ define i64 @test13(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <8 x i8> - %1 = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <8 x i8> + %1 = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test12( @@ -3077,16 +3091,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3095,16 +3109,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> 
%5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test11( @@ -3117,16 +3131,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3135,16 +3149,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test10( @@ -3157,16 +3171,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to 
x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3175,16 +3189,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test9( @@ -3197,16 +3211,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3215,16 +3229,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test8( @@ -3237,16 +3251,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr 
@__msan_retval_tls, align 8 @@ -3255,16 +3269,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test7( @@ -3277,18 +3291,18 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP21]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <4 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP14]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -3299,16 +3313,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> 
@llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test6( @@ -3321,16 +3335,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3339,16 +3353,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test5( @@ -3361,16 +3375,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to 
<2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3379,16 +3393,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test4( @@ -3401,16 +3415,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to 
<1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3419,16 +3433,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test3( @@ -3441,16 +3455,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: 
[[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3459,16 +3473,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test2( @@ -3481,16 +3495,16 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3499,43 +3513,44 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %2, <1 x i64> %3) 
nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 { +define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind #0 { ; ALL-LABEL: test89: ; ALL: # %bb.0: ; ALL-NEXT: cvtpi2ps %mm0, %xmm0 ; ALL-NEXT: ret{{[l|q]}} ; CHECK-LABEL: define <4 x float> @test89( -; CHECK-SAME: <4 x float> [[A:%.*]], x86_mmx [[B:%.*]]) #[[ATTR4:[0-9]+]] { +; CHECK-SAME: <4 x float> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], x86_mmx [[B]]) +; CHECK: 6: +; CHECK-NEXT: [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], <1 x i64> [[B]]) ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[C]] ; - %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b) + %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, <1 x i64> %b) ret <4 x float> %c } -declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone define void @test90() #0 { ; ALL-LABEL: test90: @@ -3562,28 +3577,24 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP6]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP1:%.*]] = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx [[TMP0]], i32 [[D]], i32 2) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to <1 x i64> +; CHECK: 4: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> [[A_COERCE]], i32 [[D]], i32 2) ; CHECK-NEXT: store <1 x i64> 
zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <1 x i64> [[TMP2]] +; CHECK-NEXT: ret <1 x i64> [[TMP9]] ; entry: - %0 = bitcast <1 x i64> %a.coerce to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2) - %2 = bitcast x86_mmx %1 to <1 x i64> - ret <1 x i64> %2 + %1 = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> %a.coerce, i32 %d, i32 2) + ret <1 x i64> %1 } -declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg) +declare <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64>, i32, i32 immarg) define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 { ; CHECK-LABEL: define i32 @test_mm_extract_pi16( @@ -3592,24 +3603,22 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx [[TMP0]], i32 2) +; CHECK: 3: +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> [[A_COERCE]], i32 2) ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP1]] ; entry: - %0 = bitcast <1 x i64> %a.coerce to x86_mmx - %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2) + %1 = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> %a.coerce, i32 2) ret i32 %1 } -declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg) +declare i32 @llvm.x86.mmx.pextr.w(<1 x i64>, i32 immarg) attributes #0 = { sanitize_memory } ;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll index 34c3ca3706eeb5..1e412cbac84c2a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt %s -S -passes=msan 2>&1 | FileCheck %s ; new test from upstream -; XFAIL: * target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll index 93d89b40bce343..9da7f01806a7a1 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll @@ -5,9 +5,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-unknown-linux-gnu" declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone -declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory { entry: @@ -23,19 +23,19 @@ entry: ; CHECK: ret <4 x i32> -define x86_mmx @Test_ssse3_pmadd_ub_sw(x86_mmx %a, x86_mmx %b) sanitize_memory { +define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_memory { entry: - %c = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind - ret x86_mmx %c + %c = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind + ret <1 x i64> %c } ; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw( -; CHECK: or i64 -; CHECK: bitcast i64 {{.*}} to <4 x i16> +; CHECK: or <1 x i64> +; CHECK: bitcast <1 x i64> {{.*}} to <4 x i16> ; CHECK: icmp ne <4 x i16> {{.*}}, zeroinitializer ; CHECK: sext <4 x i1> {{.*}} to <4 x i16> -; CHECK: bitcast <4 x i16> {{.*}} to i64 -; CHECK: ret x86_mmx +; CHECK: bitcast <4 x i16> {{.*}} to <1 x i64> +; CHECK: ret <1 x i64> define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory { @@ -52,15 +52,15 @@ define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_mem ; CHECK: ret <2 x i64> -define x86_mmx @Test_x86_mmx_psad_bw(x86_mmx %a, x86_mmx %b) sanitize_memory { +define <1 x i64> @Test_x86_mmx_psad_bw(<1 x i64> %a, <1 x i64> %b) sanitize_memory { entry: - %c = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind - ret x86_mmx %c + %c = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind + ret <1 x i64> %c } ; CHECK-LABEL: @Test_x86_mmx_psad_bw( -; CHECK: or i64 +; CHECK: or <1 x i64> ; CHECK: icmp ne i64 ; CHECK: sext i1 {{.*}} to i64 ; CHECK: lshr i64 {{.*}}, 48 -; CHECK: ret x86_mmx +; CHECK: ret <1 x i64> diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll index 52acbfe0a0e779..e9202700b1df74 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll 
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
 declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 ; Single argument vector conversion.
@@ -27,12 +27,12 @@ entry:
 ; CHECK: store i32 0, {{.*}} @__msan_retval_tls
 ; CHECK: ret i32
-; x86_mmx packed vector conversion.
+; <1 x i64> packed vector conversion.
-define x86_mmx @test_cvtps2pi(<4 x float> %value) sanitize_memory {
+define <1 x i64> @test_cvtps2pi(<4 x float> %value) sanitize_memory {
 entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %value)
-  ret x86_mmx %0
+  %0 = tail call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %value)
+  ret <1 x i64> %0
 }
 ; CHECK-LABEL: @test_cvtps2pi
@@ -42,9 +42,9 @@ entry:
 ; CHECK: icmp ne {{.*}}[[S]], 0
 ; CHECK: br
 ; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
-; CHECK: store i64 0, {{.*}} @__msan_retval_tls
-; CHECK: ret x86_mmx
+; CHECK: call <1 x i64> @llvm.x86.sse.cvtps2pi
+; CHECK: store <1 x i64> zeroinitializer, {{.*}} @__msan_retval_tls
+; CHECK: ret <1 x i64>
 ; avx512 rounding conversion.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
index 775c29791aefed..0f6f1fe4a7dcad 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
 declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
 define <8 x i16> @Test_packssdw_128(<4 x i32> %a, <4 x i32> %b) sanitize_memory {
 entry:
@@ -40,22 +40,21 @@ entry:
 ; CHECK: ret <32 x i8>
-define x86_mmx @Test_mmx_packuswb(x86_mmx %a, x86_mmx %b) sanitize_memory {
+define <1 x i64> @Test_mmx_packuswb(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
 entry:
-  %c = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind
-  ret x86_mmx %c
+  %c = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind
+  ret <1 x i64> %c
 }
 ; CHECK-LABEL: @Test_mmx_packuswb(
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
 ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
 ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packsswb({{.*}}
-; CHECK-DAG: bitcast x86_mmx {{.*}} to i64
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packuswb({{.*}}
-; CHECK: ret x86_mmx
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packsswb({{.*}}
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packuswb({{.*}}
+; CHECK: ret <1 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
index 7514e6ea74bb46..461d6cb9217d8f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
@@ -6,7 +6,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>)
 declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>)
 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>)
 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>)
@@ -18,10 +18,10 @@ declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32)
 define i64 @test_mmx(i64 %x.coerce, i64 %y.coerce) sanitize_memory {
 entry:
   %0 = bitcast i64 %x.coerce to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = bitcast i64 %y.coerce to x86_mmx
-  %3 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %1, x86_mmx %2)
-  %4 = bitcast x86_mmx %3 to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = bitcast i64 %y.coerce to <1 x i64>
+  %3 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %1, <1 x i64> %2)
+  %4 = bitcast <1 x i64> %3 to <2 x i32>
   %5 = bitcast <2 x i32> %4 to <1 x i64>
   %6 = extractelement <1 x i64> %5, i32 0
   ret i64 %6
@@ -29,11 +29,11 @@ entry:
 ; CHECK-LABEL: @test_mmx
 ; CHECK: = icmp ne i64 {{.*}}, 0
-; CHECK: [[C:%.*]] = sext i1 {{.*}} to i64
-; CHECK: [[A:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(
-; CHECK: [[B:%.*]] = bitcast x86_mmx {{.*}}[[A]] to i64
-; CHECK: = or i64 {{.*}}[[B]], {{.*}}[[C]]
-; CHECK: call x86_mmx @llvm.x86.mmx.psll.d(
+; CHECK: [[B:%.*]] = sext i1 {{.*}} to i64
+; CHECK: [[C:%.*]] = bitcast i64 [[B]] to <1 x i64>
+; CHECK: [[A:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(
+; CHECK: = or <1 x i64> {{.*}}[[A]], {{.*}}[[C]]
+; CHECK: call <1 x i64> @llvm.x86.mmx.psll.d(
 ; CHECK: ret i64
diff --git a/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll b/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
index 990e2810d851e4..2e9da9f2b5b779 100644
--- a/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
+++ b/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
@@ -11,7 +11,7 @@ define void @f_Ym(i64 %m.coerce) {
 ; CHECK: ## InlineAsm End
 entry:
-  %0 = tail call x86_mmx asm sideeffect "movq $0, %mm1\0A\09", "=^Ym,~{dirflag},~{fpsr},~{flags}"()
+  %0 = tail call <1 x i64> asm sideeffect "movq $0, %mm1\0A\09", "=^Ym,~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
index 63114288fc5810..76e2f9af2ed1de 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -7,12 +7,12 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; DemandedBits - MOVMSK zeros the upper bits of the result.
 ;
-define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
+define i32 @test_upper_x86_mmx_pmovmskb(<1 x i64> %a0) {
 ; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[A0:%.*]])
 ; CHECK-NEXT: ret i32 [[TMP1]]
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %a0)
   %2 = and i32 %1, 255
   ret i32 %2
 }
@@ -87,11 +87,11 @@ define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 ; DemandedBits - If we don't use the lower bits then we just return zero.
 ;
-define i32 @test_lower_x86_mmx_pmovmskb(x86_mmx %a0) {
+define i32 @test_lower_x86_mmx_pmovmskb(<1 x i64> %a0) {
 ; CHECK-LABEL: @test_lower_x86_mmx_pmovmskb(
 ; CHECK-NEXT: ret i32 0
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %a0)
   %2 = and i32 %1, -256
   ret i32 %2
 }
@@ -151,7 +151,7 @@ define i32 @undef_x86_mmx_pmovmskb() {
 ; CHECK-LABEL: @undef_x86_mmx_pmovmskb(
 ; CHECK-NEXT: ret i32 0
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx undef)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> undef)
   ret i32 %1
 }
@@ -207,16 +207,6 @@ define i32 @undef_x86_avx2_pmovmskb() {
 ; Constant Folding (ZERO -> ZERO)
 ;
-define i32 @zero_x86_mmx_pmovmskb() {
-; CHECK-LABEL: @zero_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx))
-; CHECK-NEXT: ret i32 [[TMP1]]
-;
-  %1 = bitcast <1 x i64> zeroinitializer to x86_mmx
-  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
-  ret i32 %2
-}
-
 define i32 @zero_x86_sse_movmsk_ps() {
 ; CHECK-LABEL: @zero_x86_sse_movmsk_ps(
 ; CHECK-NEXT: ret i32 0
@@ -271,11 +261,11 @@ define i32 @zero_x86_avx2_pmovmskb() {
 define i32 @fold_x86_mmx_pmovmskb() {
 ; CHECK-LABEL: @fold_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> to x86_mmx))
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> splat (i64 18084223940296448))
 ; CHECK-NEXT: ret i32 [[TMP1]]
 ;
-  %1 = bitcast <8 x i8> to x86_mmx
-  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
+  %1 = bitcast <8 x i8> to <1 x i64>
+  %2 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %1)
   ret i32 %2
 }
@@ -447,7 +437,7 @@ define i32 @sext_sse_movmsk_ps_must_replicate_bits(<2 x i1> %x) {
   ret i32 %r
 }
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx)
+declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>)
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
index 38a7391a1a1e37..d4ec9e3aae6795 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
@@ -38,38 +38,6 @@ define <1 x i64> @d(i64 %y) {
   ret <1 x i64> %c
 }
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT: [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: ret x86_mmx [[C]]
-;
-  %c = bitcast <1 x i64> %y to x86_mmx
-  ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT: [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT: ret <1 x i64> [[C]]
-;
-  %c = bitcast x86_mmx %y to <1 x i64>
-  ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT: ret double [[TMP0]]
-;
-entry:
-  %0 = bitcast x86_mmx %x to <1 x i64>
-  %1 = bitcast <1 x i64> %0 to double
-  ret double %1
-}
-
 ; FP source is ok.
 define <3 x i64> @bitcast_inselt_undef(double %x, i32 %idx) {
@@ -137,19 +105,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
   ret <3 x i64> %i
 }
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT: [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT: [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT: ret <3 x i64> [[I]]
-;
-  %xb = bitcast x86_mmx %x to i64
-  %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
-  ret <3 x i64> %i
-}
-
 ; Reduce number of casts
 define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
index 8b8325b1472637..f787b3c4cc9ac2 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
@@ -38,37 +38,6 @@ define <1 x i64> @d(i64 %y) {
   ret <1 x i64> %c
 }
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT: [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: ret x86_mmx [[C]]
-;
-  %c = bitcast <1 x i64> %y to x86_mmx
-  ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT: [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT: ret <1 x i64> [[C]]
-;
-  %c = bitcast x86_mmx %y to <1 x i64>
-  ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT: ret double [[TMP0]]
-;
-entry:
-  %0 = bitcast x86_mmx %x to <1 x i64>
-  %1 = bitcast <1 x i64> %0 to double
-  ret double %1
-}
 ; FP source is ok.
@@ -137,19 +106,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
   ret <3 x i64> %i
 }
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT: [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT: [[I:%.*]] = insertelement <3 x i64> undef, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT: ret <3 x i64> [[I]]
-;
-  %xb = bitcast x86_mmx %x to i64
-  %i = insertelement <3 x i64> undef, i64 %xb, i32 %idx
-  ret <3 x i64> %i
-}
-
 ; Reduce number of casts
 define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll
index 1534d5a4b3aaeb..ca748a9483e9b2 100644
--- a/llvm/test/Transforms/InstCombine/cast.ll
+++ b/llvm/test/Transforms/InstCombine/cast.ll
@@ -937,27 +937,6 @@ define float @test2c() {
   ret float extractelement (<2 x float> bitcast (double bitcast (<2 x float> to double) to <2 x float>), i32 0)
 }
-define i64 @test_mmx(<2 x i32> %x) {
-; ALL-LABEL: @test_mmx(
-; ALL-NEXT: [[C:%.*]] = bitcast <2 x i32> [[X:%.*]] to i64
-; ALL-NEXT: ret i64 [[C]]
-;
-  %A = bitcast <2 x i32> %x to x86_mmx
-  %B = bitcast x86_mmx %A to <2 x i32>
-  %C = bitcast <2 x i32> %B to i64
-  ret i64 %C
-}
-
-define i64 @test_mmx_const(<2 x i32> %c) {
-; ALL-LABEL: @test_mmx_const(
-; ALL-NEXT: ret i64 0
-;
-  %A = bitcast <2 x i32> zeroinitializer to x86_mmx
-  %B = bitcast x86_mmx %A to <2 x i32>
-  %C = bitcast <2 x i32> %B to i64
-  ret i64 %C
-}
-
 ; PR12514
 define i1 @test67(i1 %a, i32 %b) {
 ; ALL-LABEL: @test67(
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
index bce07b07756209..c383ff7a90ded2 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
@@ -12,18 +12,5 @@ define <2 x ptr> @test_gep() {
 ; CHECK-NEXT: ret <2 x ptr>
 ;
   %A = getelementptr [1 x %rec8], ptr @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer
-  %B = bitcast <2 x ptr> %A to <2 x ptr>
-  ret <2 x ptr> %B
-}
-
-; Testcase that verify the cast-of-cast when the outer/second cast is to a
-; vector type.
-
-define <4 x i16> @test_mmx_const() {
-; CHECK-LABEL: @test_mmx_const(
-; CHECK-NEXT: ret <4 x i16> zeroinitializer
-;
-  %A = bitcast <2 x i32> zeroinitializer to x86_mmx
-  %B = bitcast x86_mmx %A to <4 x i16>
-  ret <4 x i16> %B
+  ret <2 x ptr> %A
 }
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
index d4c49faf91b091..dd75560e25ceda 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
@@ -335,19 +335,6 @@ define { i64, i64 } @test_load_struct() {
   ret { i64, i64 } %v
 }
-@m64 = internal constant [2 x i64] zeroinitializer
-@idx = external global i32
-
-; This should not try to create an x86_mmx null value.
-define x86_mmx @load_mmx() {
-; CHECK-LABEL: @load_mmx(
-; CHECK-NEXT: [[TEMP:%.*]] = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64)), align 8
-; CHECK-NEXT: ret x86_mmx [[TEMP]]
-;
-  %temp = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64))
-  ret x86_mmx %temp
-}
-
 @g_offset = external global i64
 @g_neg_one_vec = constant <4 x i8>
diff --git a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll b/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
deleted file mode 100644
index b460b79d0640aa..00000000000000
--- a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define x86_mmx @f() #0 {
-; CHECK-LABEL: define x86_mmx @f
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[PHI]], 7
-; CHECK-NEXT: [[ADD_7]] = add i32 [[PHI]], 8
-; CHECK-NEXT: [[CMP_7:%.*]] = icmp eq i32 [[ADD_6]], 0
-; CHECK-NEXT: br i1 [[CMP_7]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: [[RET:%.*]] = phi x86_mmx [ undef, [[FOR_BODY]] ]
-; CHECK-NEXT: ret x86_mmx [[RET]]
-;
-entry:
-  br label %for.body
-
-for.body: ; preds = %for.body, %entry
-  %phi = phi i32 [ 1, %entry ], [ %add, %for.body ]
-  %add = add i32 %phi, 1
-  %cmp = icmp eq i32 %phi, 0
-  br i1 %cmp, label %exit, label %for.body
-
-exit: ; preds = %for.body
-  %ret = phi x86_mmx [ undef, %for.body ]
-  ret x86_mmx %ret
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
diff --git a/llvm/test/Transforms/SCCP/crash.ll b/llvm/test/Transforms/SCCP/crash.ll
index 8f8ad44db437b0..47d9329f6f03da 100644
--- a/llvm/test/Transforms/SCCP/crash.ll
+++ b/llvm/test/Transforms/SCCP/crash.ll
@@ -28,7 +28,7 @@ define i32 @test2([4 x i32] %A) {
   ret i32 %B
 }
-define x86_mmx @test3() {
-  %load = load x86_mmx, ptr null
-  ret x86_mmx %load
+define <1 x i64> @test3() {
+  %load = load <1 x i64>, ptr null
+  ret <1 x i64> %load
 }
diff --git a/llvm/test/Transforms/SROA/pr57796.ll b/llvm/test/Transforms/SROA/pr57796.ll
index 1bf1ad7ee934a5..4eb6a7107dad30 100644
--- a/llvm/test/Transforms/SROA/pr57796.ll
+++ b/llvm/test/Transforms/SROA/pr57796.ll
@@ -17,9 +17,9 @@ define void @foo() {
 ; CHECK-NEXT: [[CALL_I:%.*]] = call align 32 ptr @value_set_type(ptr align 32 [[REF_TMP_I]])
 ; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[CALL_I]], align 32
 ; CHECK-NEXT: [[REF_TMP_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 0)
-; CHECK-NEXT: store x86_mmx [[TMP2]], ptr @A, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP1]], i8 0)
+; CHECK-NEXT: store <1 x i64> [[TMP2]], ptr @A, align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -29,13 +29,13 @@ entry:
   %call.i = call align 32 ptr @value_set_type(ptr align 32 %ref.tmp.i)
   %0 = load <32 x i8>, ptr %call.i, align 32
   store <32 x i8> %0, ptr %ref.tmp, align 32
-  %1 = load x86_mmx, ptr %ref.tmp, align 32
-  %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 0)
-  store x86_mmx %2, ptr @A, align 8
+  %1 = load <1 x i64>, ptr %ref.tmp, align 32
+  %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 0)
+  store <1 x i64> %2, ptr @A, align 8
   ret void
 }
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8 immarg)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8 immarg)
 declare dso_local void @value_create(ptr sret(%struct.Value) align 32)
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index fe70ba082cb4c1..f835b98b243456 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -3,12 +3,12 @@
 ; CHECK: atomic store operand must have integer, pointer, or floating point type!
 ; CHECK: atomic load operand must have integer, pointer, or floating point type!
-define void @foo(ptr %P, x86_mmx %v) {
-  store atomic x86_mmx %v, ptr %P unordered, align 8
+define void @foo(ptr %P, <1 x i64> %v) {
+  store atomic <1 x i64> %v, ptr %P unordered, align 8
   ret void
 }
-define x86_mmx @bar(ptr %P) {
-  %v = load atomic x86_mmx, ptr %P unordered, align 8
-  ret x86_mmx %v
+define <1 x i64> @bar(ptr %P) {
+  %v = load atomic <1 x i64>, ptr %P unordered, align 8
+  ret <1 x i64> %v
 }
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index 09d4421ea8a407..4173e49e60a046 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -153,8 +153,6 @@ struct TypeCloner {
       return LLVMMetadataTypeInContext(Ctx);
     case LLVMX86_AMXTypeKind:
       return LLVMX86AMXTypeInContext(Ctx);
-    case LLVMX86_MMXTypeKind:
-      return LLVMX86MMXTypeInContext(Ctx);
    case LLVMTokenTypeKind:
      return LLVMTokenTypeInContext(Ctx);
    case LLVMTargetExtTypeKind: {
diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp
index 277938a611fd0e..e44b6023fff231 100644
--- a/llvm/tools/llvm-stress/llvm-stress.cpp
+++ b/llvm/tools/llvm-stress/llvm-stress.cpp
@@ -173,8 +173,6 @@ struct Modifier {
       Ty = Type::getX86_FP80Ty(Context);
     else if (Arg == "ppc_fp128")
       Ty = Type::getPPC_FP128Ty(Context);
-    else if (Arg == "x86_mmx")
-      Ty = Type::getX86_MMXTy(Context);
    else if (Arg.starts_with("i")) {
      unsigned N = 0;
      Arg.drop_front().getAsInteger(10, N);
@@ -294,11 +292,7 @@ struct Modifier {
   /// Pick a random vector type.
   Type *pickVectorType(VectorType *VTy = nullptr) {
-    // Vectors of x86mmx are illegal; keep trying till we get something else.
-    Type *Ty;
-    do {
-      Ty = pickScalarType();
-    } while (Ty->isX86_MMXTy());
+    Type *Ty = pickScalarType();
     if (VTy)
       return VectorType::get(Ty, VTy->getElementCount());
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index f517761efa3c0b..b730d665c78132 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -205,7 +205,6 @@ TEST(InstructionsTest, CastInst) {
   Type *Int64Ty = Type::getInt64Ty(C);
   Type *V8x8Ty = FixedVectorType::get(Int8Ty, 8);
   Type *V8x64Ty = FixedVectorType::get(Int64Ty, 8);
-  Type *X86MMXTy = Type::getX86_MMXTy(C);
   Type *HalfTy = Type::getHalfTy(C);
   Type *FloatTy = Type::getFloatTy(C);
@@ -248,9 +247,6 @@ TEST(InstructionsTest, CastInst) {
   EXPECT_EQ(CastInst::Trunc, CastInst::getCastOpcode(c64, true, V8x8Ty, true));
   EXPECT_EQ(CastInst::SExt, CastInst::getCastOpcode(c8, true, V8x64Ty, true));
-  EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, X86MMXTy));
-  EXPECT_FALSE(CastInst::isBitCastable(X86MMXTy, V8x8Ty));
-  EXPECT_FALSE(CastInst::isBitCastable(Int64Ty, X86MMXTy));
   EXPECT_FALSE(CastInst::isBitCastable(V8x64Ty, V8x8Ty));
   EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, V8x64Ty));
@@ -1809,7 +1805,7 @@ TEST(InstructionsTest, AllocaInst) {
   %A = alloca i32, i32 1
   %B = alloca i32, i32 4
   %C = alloca i32, i32 %n
-  %D = alloca <8 x double>
+  %D = alloca double
   %E = alloca
   %F = alloca [2 x half]
   %G = alloca [2 x [3 x i128]]
@@ -1835,7 +1831,8 @@ TEST(InstructionsTest, AllocaInst) {
   EXPECT_EQ(A.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
   EXPECT_EQ(B.getAllocationSizeInBits(DL), TypeSize::getFixed(128));
   EXPECT_FALSE(C.getAllocationSizeInBits(DL));
-  EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(512));
+  EXPECT_EQ(DL.getTypeSizeInBits(D.getAllocatedType()), TypeSize::getFixed(64));
+  EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(64));
   EXPECT_EQ(E.getAllocationSizeInBits(DL), TypeSize::getScalable(512));
   EXPECT_EQ(F.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
   EXPECT_EQ(G.getAllocationSizeInBits(DL), TypeSize::getFixed(768));
diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md
index bc0f484108facf..fadc81b567b4e4 100644
--- a/mlir/docs/Dialects/LLVM.md
+++ b/mlir/docs/Dialects/LLVM.md
@@ -240,8 +240,6 @@ dialect as there is no corresponding built-in type.
 The following non-parametric types derived from the LLVM IR are available in the LLVM dialect:
-- `!llvm.x86_mmx` (`LLVMX86MMXType`) - value held in an MMX register on x86
-  machine.
 - `!llvm.ppc_fp128` (`LLVMPPCFP128Type`) - 128-bit floating-point value (two 64 bits).
 - `!llvm.token` (`LLVMTokenType`) - a non-inspectable value associated with an
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
index f3d211b4aebb17..2ea589a7c4c3bd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
@@ -67,7 +67,6 @@ namespace LLVM {
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMVoidType, "llvm.void");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMPPCFP128Type, "llvm.ppc_fp128");
-DEFINE_TRIVIAL_LLVM_TYPE(LLVMX86MMXType, "llvm.x86_mmx");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMTokenType, "llvm.token");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMLabelType, "llvm.label");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMMetadataType, "llvm.metadata");
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 929918e5c3e76f..6b2d8943bf4885 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -3527,7 +3527,6 @@ void LLVMDialect::initialize() {
   // clang-format off
   addTypes(type)
       .Case([&](Type) { return "void"; })
       .Case([&](Type) { return "ppc_fp128"; })
-      .Case([&](Type) { return "x86_mmx"; })
      .Case([&](Type) { return "token"; })
      .Case([&](Type) { return "label"; })
      .Case([&](Type) { return "metadata"; })
@@ -310,7 +309,6 @@ static Type dispatchParse(AsmParser &parser, bool allowAny = true) {
   return StringSwitch>(key)
       .Case("void", [&] { return LLVMVoidType::get(ctx); })
       .Case("ppc_fp128", [&] { return LLVMPPCFP128Type::get(ctx); })
-      .Case("x86_mmx", [&] { return LLVMX86MMXType::get(ctx); })
      .Case("token", [&] { return LLVMTokenType::get(ctx); })
      .Case("label", [&] { return LLVMLabelType::get(ctx); })
      .Case("metadata", [&] { return LLVMMetadataType::get(ctx); })
diff --git a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
index c00214ede206dd..ea990ca7aefbe0 100644
--- a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
@@ -67,8 +67,6 @@ class TypeFromLLVMIRTranslatorImpl {
       return LLVM::LLVMX86AMXType::get(&context);
     if (type->isPPC_FP128Ty())
       return LLVM::LLVMPPCFP128Type::get(&context);
-    if (type->isX86_MMXTy())
-      return LLVM::LLVMX86MMXType::get(&context);
    if (type->isLabelTy())
      return LLVM::LLVMLabelType::get(&context);
    if (type->isMetadataTy())
diff --git a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
index be200acac28e7e..c7a533eddce84b 100644
--- a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
@@ -58,9 +58,6 @@ class TypeToLLVMIRTranslatorImpl {
           .Case([this](LLVM::LLVMPPCFP128Type) {
             return llvm::Type::getPPC_FP128Ty(context);
           })
-          .Case([this](LLVM::LLVMX86MMXType) {
-            return llvm::Type::getX86_MMXTy(context);
-          })
          .Case([this](LLVM::LLVMTokenType) {
            return llvm::Type::getTokenTy(context);
          })
diff --git a/offload/hostexec/services/execute_service.cpp b/offload/hostexec/services/execute_service.cpp
index 489f57e737a975..2e0e6741b456e8 100644
--- a/offload/hostexec/services/execute_service.cpp
+++ b/offload/hostexec/services/execute_service.cpp
@@ -417,7 +417,6 @@ enum TypeID {
   VoidTyID, ///< type with no size
   LabelTyID, ///< Labels
   MetadataTyID, ///< Metadata
-  X86_MMXTyID, ///< MMX vectors (64 bits, X86 specific)
   X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific)
   TokenTyID, ///< Tokens
@@ -528,7 +527,6 @@ static service_rc hostrpc_pfBuildValist(hostrpc_ValistExt_t *valist,
   case StructTyID: ///< 13: Structures
   case FunctionTyID: ///< 12: Functions
   case TokenTyID: ///< 10: Tokens
-  case X86_MMXTyID: ///< 9: MMX vectors (64 bits, X86 specific)
   case MetadataTyID: ///< 8: Metadata
   case LabelTyID: ///< 7: Labels
   case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
@@ -766,7 +764,6 @@ static service_rc hostrpc_build_vargs_array(int NumArgs, char *keyptr,
   case StructTyID: ///< 13: Structures
   case FunctionTyID: ///< 12: Functions
   case TokenTyID: ///< 10: Tokens
-  case X86_MMXTyID: ///< 9: MMX vectors (64 bits, X86 specific)
   case MetadataTyID: ///< 8: Metadata
   case LabelTyID: ///< 7: Labels
   case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
diff --git a/revert_patches.txt b/revert_patches.txt
index 85a42628e187c8..710f0990bf25fa 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -1,9 +1,4 @@
 ---
-Revert: breaks devIO for openmp
-dfeb3991fb48 Remove the `x86_mmx` IR type. (#98505)
-b7e4fba6e5dc Cleanup x86_mmx after removing IR type (#100646) (Reason: dependent on dfeb3991fb48)
-Ron: still broken 11-1-24
----
 Revert: breaks build of hipCUB
 commit 55783bd0
 [HIP] fix host min/max in header (#82956)
 Sam