From bf94321b3247ab5eb2d898e23530a1bebb307e12 Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Wed, 10 Jul 2024 00:49:25 -0400
Subject: [PATCH 1/8] Clang: don't unnecessarily convert inline-asm operands to x86mmx in IR.

The SelectionDAG asm-lowering code can already handle conversion of
other vector types to MMX if needed.
---
 clang/lib/CodeGen/Targets/X86.cpp       | 13 -------------
 clang/test/CodeGen/X86/mmx-inline-asm.c |  2 +-
 clang/test/CodeGen/asm-inout.c          |  6 +++---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  6 +++---
 llvm/test/CodeGen/X86/mmx-inlineasm.ll  | 20 ++++++++++++++++++++
 5 files changed, 27 insertions(+), 20 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/mmx-inlineasm.ll

diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 1dc3172a6bdf9b..8913b188f6aec3 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -27,19 +27,6 @@ bool IsX86_MMXType(llvm::Type *IRType) {
 static llvm::Type* X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
                                           StringRef Constraint,
                                           llvm::Type* Ty) {
-  bool IsMMXCons = llvm::StringSwitch<bool>(Constraint)
-                     .Cases("y", "&y", "^Ym", true)
-                     .Default(false);
-  if (IsMMXCons && Ty->isVectorTy()) {
-    if (cast<llvm::VectorType>(Ty)->getPrimitiveSizeInBits().getFixedValue() !=
-        64) {
-      // Invalid MMX constraint
-      return nullptr;
-    }
-
-    return llvm::Type::getX86_MMXTy(CGF.getLLVMContext());
-  }
-
   if (Constraint == "k") {
     llvm::Type *Int1Ty = llvm::Type::getInt1Ty(CGF.getLLVMContext());
     return llvm::FixedVectorType::get(Int1Ty, Ty->getScalarSizeInBits());
diff --git a/clang/test/CodeGen/X86/mmx-inline-asm.c b/clang/test/CodeGen/X86/mmx-inline-asm.c
index 19c24a3a91e14f..a0702c7f780d12 100644
--- a/clang/test/CodeGen/X86/mmx-inline-asm.c
+++ b/clang/test/CodeGen/X86/mmx-inline-asm.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s
 #include <mmintrin.h>
 
-// CHECK: { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx }
+// CHECK: { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
 
 void foo(long long fill) {
   __m64 vfill = _mm_cvtsi64_m64(fill);
diff --git a/clang/test/CodeGen/asm-inout.c b/clang/test/CodeGen/asm-inout.c
index 1383a421efbc26..6d40451b778d91 100644
--- a/clang/test/CodeGen/asm-inout.c
+++ b/clang/test/CodeGen/asm-inout.c
@@ -38,11 +38,11 @@ int test4(volatile int *addr) {
   return (int)oldval;
 }
 
-// This should have both inputs be of type x86_mmx.
+// This should have both inputs be of type <1 x i64>.
// CHECK: @test5 typedef long long __m64 __attribute__((__vector_size__(8))); __m64 test5(__m64 __A, __m64 __B) { - // CHECK: call x86_mmx asm "pmulhuw $1, $0\0A\09", "=y,y,0,~{dirflag},~{fpsr},~{flags}"(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: call <1 x i64> asm "pmulhuw $1, $0\0A\09", "=y,y,0,~{dirflag},~{fpsr},~{flags}"(<1 x i64> %{{.*}}, <1 x i64> %{{.*}}) asm ("pmulhuw %1, %0\n\t" : "+y" (__A) : "y" (__B)); return __A; } @@ -51,7 +51,7 @@ __m64 test5(__m64 __A, __m64 __B) { int test6(void) { typedef unsigned char __attribute__((vector_size(8))) _m64u8; _m64u8 __attribute__((aligned(16))) Mu8_0, __attribute__((aligned(16))) Mu8_1; - // CHECK: call x86_mmx asm "nop", "=y,0,~{dirflag},~{fpsr},~{flags}"(x86_mmx %1) + // CHECK: call <8 x i8> asm "nop", "=y,0,~{dirflag},~{fpsr},~{flags}"(<8 x i8> %0) asm ("nop" : "=y"(Mu8_1 ) : "0"(Mu8_0 )); return 0; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e116285d043c0c..fa26849e0bc5a1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58236,7 +58236,7 @@ X86TargetLowering::getSingleConstraintMatchWeight( Wt = CW_SpecificReg; break; case 'y': - if (Ty->isX86_MMXTy() && Subtarget.hasMMX()) + if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX()) Wt = CW_SpecificReg; break; case 'Y': @@ -58259,8 +58259,8 @@ X86TargetLowering::getSingleConstraintMatchWeight( return CW_Invalid; // Any MMX reg case 'm': - if (Ty->isX86_MMXTy() && Subtarget.hasMMX()) - return Wt; + if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX()) + return CW_SpecificReg; return CW_Invalid; // Any SSE reg when ISA >= SSE2, same as 'x' case 'i': diff --git a/llvm/test/CodeGen/X86/mmx-inlineasm.ll b/llvm/test/CodeGen/X86/mmx-inlineasm.ll new file mode 100644 index 00000000000000..5a15600a4b3121 --- /dev/null +++ b/llvm/test/CodeGen/X86/mmx-inlineasm.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx | FileCheck %s + +;; Verify that the mmx 'y' constraint works with arbitrary IR types. +define <2 x i32> @test_mmx_asm(<2 x i32> %a) nounwind { +; CHECK-LABEL: test_mmx_asm: +; CHECK: # %bb.0: +; CHECK-NEXT: movdq2q %xmm0, %mm0 +; CHECK-NEXT: #APP +; CHECK-NEXT: # %mm0 = %mm0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # %mm0 = %mm0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: retq + %1 = tail call i64 asm sideeffect "# $0 = $1", "=y,y"(<2 x i32> %a) + %2 = tail call <2 x i32> asm sideeffect "# $0 = $1", "=y,y"(i64 %1) + ret <2 x i32> %2 +} From da88aa73ad9a34dcc6745279a00edd4ee0c59501 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Wed, 10 Jul 2024 19:01:55 -0400 Subject: [PATCH 2/8] Remove the `x86_mmx` IR type. It is now translated to `<1 x i64>`, which allows the removal of a bunch of special casing. This changes the ABI of any LLVM IR function with `x86_mmx` arguments or returns: instead of passing in mmx registers, it will now pass via integer registers. However, the real-world incompatibility generated by this is minimal, since Clang never uses the x86_mmx type -- it lowers `__m64` to either `<1 x i64>` or `double`, depending on ABI. This change does _not_ eliminate the SelectionDAG `MVT::x86mmx` type. That no longer corresponds to an IR type, and is used only by MMX intrinsics and inline-asm operands. 
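As a rough illustration (the function below is hypothetical, not part of this
change), IR that used to be written against the special type:

  declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)

  define x86_mmx @add8(x86_mmx %a, x86_mmx %b) {
    %r = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b)
    ret x86_mmx %r
  }

now parses and upgrades (both the textual `x86_mmx` keyword and the bitcode
type code decode to `<1 x i64>`) as:

  declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)

  define <1 x i64> @add8(<1 x i64> %a, <1 x i64> %b) {
    %r = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a, <1 x i64> %b)
    ret <1 x i64> %r
  }

with the `<1 x i64>` argument and return values passed in integer registers,
per the ABI note above.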
In order to correctly handle the MMX intrinsics, a hack has been added to `SelectionDAGBuilder::visitTargetIntrinsic`, because there's no generic way to specify a custom translation from LLVM IR type to SelectionDAG type for an intrinsic lowering. (This may be a short-lived hack, if all the MMX intrinsics can be removed in upcoming changes.) --- clang/lib/CodeGen/CGBuiltin.cpp | 4 +- llvm/bindings/ocaml/llvm/llvm.mli | 4 - llvm/bindings/ocaml/llvm/llvm_ocaml.c | 5 - llvm/include/llvm-c/Core.h | 48 +- llvm/include/llvm/IR/DataLayout.h | 1 - llvm/include/llvm/IR/Type.h | 12 +- llvm/lib/Analysis/ConstantFolding.cpp | 8 +- llvm/lib/AsmParser/LLLexer.cpp | 3 +- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 4 +- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 5 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 17 + llvm/lib/CodeGen/ValueTypes.cpp | 7 +- llvm/lib/IR/AsmWriter.cpp | 5 +- llvm/lib/IR/ConstantFold.cpp | 2 +- llvm/lib/IR/Core.cpp | 8 - llvm/lib/IR/DataLayout.cpp | 1 - llvm/lib/IR/Function.cpp | 14 +- llvm/lib/IR/Instructions.cpp | 9 - llvm/lib/IR/LLVMContextImpl.cpp | 6 +- llvm/lib/IR/LLVMContextImpl.h | 2 +- llvm/lib/IR/Type.cpp | 15 +- .../DirectX/DXILWriter/DXILBitcodeWriter.cpp | 3 - .../Hexagon/HexagonTargetObjectFile.cpp | 1 - llvm/lib/Target/X86/X86CallingConv.td | 34 - .../Target/X86/X86InstCombineIntrinsic.cpp | 8 +- .../IPO/DeadArgumentElimination.cpp | 6 +- .../InstCombine/InstCombineCasts.cpp | 7 - .../Instrumentation/MemorySanitizer.cpp | 42 +- llvm/test/Assembler/x86mmx.ll | 9 - llvm/test/Bitcode/bcanalyzer-types.ll | 6 - llvm/test/Bitcode/compatibility-3.6.ll | 2 +- llvm/test/Bitcode/compatibility-3.7.ll | 2 +- llvm/test/Bitcode/compatibility-3.8.ll | 2 +- llvm/test/Bitcode/compatibility-3.9.ll | 2 +- llvm/test/Bitcode/compatibility-4.0.ll | 2 +- llvm/test/Bitcode/compatibility-5.0.ll | 2 +- llvm/test/Bitcode/compatibility-6.0.ll | 2 +- llvm/test/Bitcode/compatibility.ll | 2 +- .../CodeGen/X86/2008-09-05-sinttofp-2xi32.ll | 21 +- llvm/test/CodeGen/X86/3dnow-intrinsics.ll | 68 +- llvm/test/CodeGen/X86/avx-vbroadcast.ll | 8 +- llvm/test/CodeGen/X86/avx2-vbroadcast.ll | 11 +- llvm/test/CodeGen/X86/fast-isel-bc.ll | 9 +- .../test/CodeGen/X86/fast-isel-nontemporal.ll | 3 +- .../CodeGen/X86/mmx-arg-passing-x86-64.ll | 15 +- llvm/test/CodeGen/X86/mmx-arg-passing.ll | 10 +- llvm/test/CodeGen/X86/mmx-bitcast-fold.ll | 2 +- llvm/test/CodeGen/X86/mmx-bitcast.ll | 4 +- llvm/test/CodeGen/X86/mmx-fold-load.ll | 107 +- llvm/test/CodeGen/X86/mmx-intrinsics.ll | 398 ++-- llvm/test/CodeGen/X86/pr23246.ll | 2 +- llvm/test/CodeGen/X86/select-mmx.ll | 35 +- llvm/test/CodeGen/X86/stack-folding-3dnow.ll | 140 +- llvm/test/CodeGen/X86/stack-folding-mmx.ll | 508 ++--- llvm/test/CodeGen/X86/vec_extract-mmx.ll | 11 +- llvm/test/CodeGen/X86/vec_insert-7.ll | 3 +- llvm/test/CodeGen/X86/vec_insert-mmx.ll | 8 +- .../MemorySanitizer/X86/mmx-intrinsics.ll | 1667 +++++++++-------- .../MemorySanitizer/vector_arith.ll | 12 +- .../MemorySanitizer/vector_cvt.ll | 6 +- .../MemorySanitizer/vector_pack.ll | 15 +- .../MemorySanitizer/vector_shift.ll | 10 +- .../Transforms/InstCombine/X86/x86-movmsk.ll | 14 +- .../bitcast-vec-canon-inseltpoison.ll | 45 - .../InstCombine/bitcast-vec-canon.ll | 44 - .../InstSimplify/ConstProp/loads.ll | 13 - llvm/test/Transforms/LoopUnroll/X86/mmx.ll | 35 - .../Transforms/SLPVectorizer/X86/bad_types.ll | 62 - llvm/test/Transforms/SROA/pr57796.ll | 6 +- llvm/tools/llvm-c-test/echo.cpp | 2 - llvm/tools/llvm-stress/llvm-stress.cpp | 8 +- llvm/unittests/IR/InstructionsTest.cpp | 9 +- 
mlir/docs/Dialects/LLVM.md | 2 - mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h | 1 - mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 1 - mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp | 2 - mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp | 5 +- mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp | 2 - mlir/lib/Target/LLVMIR/TypeToLLVM.cpp | 3 - mlir/test/Dialect/LLVMIR/types.mlir | 2 - mlir/test/Target/LLVMIR/llvmir-types.mlir | 2 - 81 files changed, 1777 insertions(+), 1866 deletions(-) delete mode 100644 llvm/test/Assembler/x86mmx.ll delete mode 100644 llvm/test/Transforms/LoopUnroll/X86/mmx.ll diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 6cc0d9485720c0..36853098b118d2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14386,7 +14386,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vec_init_v4hi: case X86::BI__builtin_ia32_vec_init_v2si: return Builder.CreateBitCast(BuildVector(Ops), - llvm::Type::getX86_MMXTy(getLLVMContext())); + llvm::FixedVectorType::get(Int64Ty, 1)); case X86::BI__builtin_ia32_vec_ext_v2si: case X86::BI__builtin_ia32_vec_ext_v16qi: case X86::BI__builtin_ia32_vec_ext_v8hi: @@ -15971,7 +15971,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // 3DNow! case X86::BI__builtin_ia32_pswapdsf: case X86::BI__builtin_ia32_pswapdsi: { - llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext()); + llvm::Type *MMXTy = llvm::FixedVectorType::get(Int64Ty, 1); Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast"); llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd); return Builder.CreateCall(F, Ops, "pswapd"); diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli index c16530d3a70cb4..b8a430adf6cf2d 100644 --- a/llvm/bindings/ocaml/llvm/llvm.mli +++ b/llvm/bindings/ocaml/llvm/llvm.mli @@ -760,10 +760,6 @@ val void_type : llcontext -> lltype [llvm::Type::LabelTy]. *) val label_type : llcontext -> lltype -(** [x86_mmx_type c] returns the x86 64-bit MMX register type in the - context [c]. See [llvm::Type::X86_MMXTy]. *) -val x86_mmx_type : llcontext -> lltype - (** [type_by_name m name] returns the specified type from the current module if it exists. 
See the method [llvm::Module::getTypeByName] *) diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c index 4ac824cd6a98a6..5906f427e69072 100644 --- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c +++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c @@ -686,11 +686,6 @@ value llvm_label_type(value Context) { return to_val(LLVMLabelTypeInContext(Context_val(Context))); } -/* llcontext -> lltype */ -value llvm_x86_mmx_type(value Context) { - return to_val(LLVMX86MMXTypeInContext(Context_val(Context))); -} - /* llmodule -> string -> lltype option */ value llvm_type_by_name(value M, value Name) { return ptr_to_option(LLVMGetTypeByName(Module_val(M), String_val(Name))); diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 9867db4839fe19..1b18f31e3925cb 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -146,27 +146,27 @@ typedef enum { } LLVMOpcode; typedef enum { - LLVMVoidTypeKind, /**< type with no size */ - LLVMHalfTypeKind, /**< 16 bit floating point type */ - LLVMFloatTypeKind, /**< 32 bit floating point type */ - LLVMDoubleTypeKind, /**< 64 bit floating point type */ - LLVMX86_FP80TypeKind, /**< 80 bit floating point type (X87) */ - LLVMFP128TypeKind, /**< 128 bit floating point type (112-bit mantissa)*/ - LLVMPPC_FP128TypeKind, /**< 128 bit floating point type (two 64-bits) */ - LLVMLabelTypeKind, /**< Labels */ - LLVMIntegerTypeKind, /**< Arbitrary bit width integers */ - LLVMFunctionTypeKind, /**< Functions */ - LLVMStructTypeKind, /**< Structures */ - LLVMArrayTypeKind, /**< Arrays */ - LLVMPointerTypeKind, /**< Pointers */ - LLVMVectorTypeKind, /**< Fixed width SIMD vector type */ - LLVMMetadataTypeKind, /**< Metadata */ - LLVMX86_MMXTypeKind, /**< X86 MMX */ - LLVMTokenTypeKind, /**< Tokens */ - LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */ - LLVMBFloatTypeKind, /**< 16 bit brain floating point type */ - LLVMX86_AMXTypeKind, /**< X86 AMX */ - LLVMTargetExtTypeKind, /**< Target extension type */ + LLVMVoidTypeKind = 0, /**< type with no size */ + LLVMHalfTypeKind = 1, /**< 16 bit floating point type */ + LLVMFloatTypeKind = 2, /**< 32 bit floating point type */ + LLVMDoubleTypeKind = 3, /**< 64 bit floating point type */ + LLVMX86_FP80TypeKind = 4, /**< 80 bit floating point type (X87) */ + LLVMFP128TypeKind = 5, /**< 128 bit floating point type (112-bit mantissa)*/ + LLVMPPC_FP128TypeKind = 6, /**< 128 bit floating point type (two 64-bits) */ + LLVMLabelTypeKind = 7, /**< Labels */ + LLVMIntegerTypeKind = 8, /**< Arbitrary bit width integers */ + LLVMFunctionTypeKind = 9, /**< Functions */ + LLVMStructTypeKind = 10, /**< Structures */ + LLVMArrayTypeKind = 11, /**< Arrays */ + LLVMPointerTypeKind = 12, /**< Pointers */ + LLVMVectorTypeKind = 13, /**< Fixed width SIMD vector type */ + LLVMMetadataTypeKind = 14, /**< Metadata */ + /* 15 previously used by LLVMX86_MMXTypeKind */ + LLVMTokenTypeKind = 16, /**< Tokens */ + LLVMScalableVectorTypeKind = 17, /**< Scalable SIMD vector type */ + LLVMBFloatTypeKind = 18, /**< 16 bit brain floating point type */ + LLVMX86_AMXTypeKind = 19, /**< X86 AMX */ + LLVMTargetExtTypeKind = 20, /**< Target extension type */ } LLVMTypeKind; typedef enum { @@ -1672,11 +1672,6 @@ LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C); */ LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C); -/** - * Create a X86 MMX type in a context. - */ -LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C); - /** * Create a X86 AMX type in a context. 
*/ @@ -1698,7 +1693,6 @@ LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C); */ LLVMTypeRef LLVMVoidType(void); LLVMTypeRef LLVMLabelType(void); -LLVMTypeRef LLVMX86MMXType(void); LLVMTypeRef LLVMX86AMXType(void); /** diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index d14adfe1590be5..5f7034b5ee36f9 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -693,7 +693,6 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const { case Type::FloatTyID: return TypeSize::getFixed(32); case Type::DoubleTyID: - case Type::X86_MMXTyID: return TypeSize::getFixed(64); case Type::PPC_FP128TyID: case Type::FP128TyID: diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index 1f0133c08e7d60..c74f9e9d24800f 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -63,7 +63,6 @@ class Type { VoidTyID, ///< type with no size LabelTyID, ///< Labels MetadataTyID, ///< Metadata - X86_MMXTyID, ///< MMX vectors (64 bits, X86 specific) X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific) TokenTyID, ///< Tokens @@ -197,9 +196,6 @@ class Type { const fltSemantics &getFltSemantics() const; - /// Return true if this is X86 MMX. - bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; } - /// Return true if this is X86 AMX. bool isX86_AMXTy() const { return getTypeID() == X86_AMXTyID; } @@ -285,8 +281,8 @@ class Type { /// Return true if the type is a valid type for a register in codegen. This /// includes all first-class types except struct and array types. bool isSingleValueType() const { - return isFloatingPointTy() || isX86_MMXTy() || isIntegerTy() || - isPointerTy() || isVectorTy() || isX86_AMXTy() || isTargetExtTy(); + return isFloatingPointTy() || isIntegerTy() || isPointerTy() || + isVectorTy() || isX86_AMXTy() || isTargetExtTy(); } /// Return true if the type is an aggregate type. This means it is valid as @@ -302,8 +298,7 @@ class Type { bool isSized(SmallPtrSetImpl *Visited = nullptr) const { // If it's a primitive, it is always sized. if (getTypeID() == IntegerTyID || isFloatingPointTy() || - getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID || - getTypeID() == X86_AMXTyID) + getTypeID() == PointerTyID || getTypeID() == X86_AMXTyID) return true; // If it is not something that can have a size (e.g. a function or label), // it doesn't have a size. @@ -453,7 +448,6 @@ class Type { static Type *getX86_FP80Ty(LLVMContext &C); static Type *getFP128Ty(LLVMContext &C); static Type *getPPC_FP128Ty(LLVMContext &C); - static Type *getX86_MMXTy(LLVMContext &C); static Type *getX86_AMXTy(LLVMContext &C); static Type *getTokenTy(LLVMContext &C); static IntegerType *getIntNTy(LLVMContext &C, unsigned N); diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 962880f68f0767..0dbe85631df04e 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -564,16 +564,14 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy, Type *MapTy = Type::getIntNTy(C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedValue()); if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) { - if (Res->isNullValue() && !LoadTy->isX86_MMXTy() && - !LoadTy->isX86_AMXTy()) + if (Res->isNullValue() && !LoadTy->isX86_AMXTy()) // Materializing a zero can be done trivially without a bitcast return Constant::getNullValue(LoadTy); Type *CastTy = LoadTy->isPtrOrPtrVectorTy() ? 
DL.getIntPtrType(LoadTy) : LoadTy; Res = FoldBitCast(Res, CastTy, DL); if (LoadTy->isPtrOrPtrVectorTy()) { // For vector of pointer, we needed to first convert to a vector of integer, then do vector inttoptr - if (Res->isNullValue() && !LoadTy->isX86_MMXTy() && - !LoadTy->isX86_AMXTy()) + if (Res->isNullValue() && !LoadTy->isX86_AMXTy()) return Constant::getNullValue(LoadTy); if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) // Be careful not to replace a load of an addrspace value with an inttoptr here @@ -764,7 +762,7 @@ Constant *llvm::ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty, // uniform. if (!DL.typeSizeEqualsStoreSize(C->getType())) return nullptr; - if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) + if (C->isNullValue() && !Ty->isX86_AMXTy()) return Constant::getNullValue(Ty); if (C->isAllOnesValue() && (Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy())) diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 7d7fe19568e8a6..c82e74972b67ca 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -838,7 +838,8 @@ lltok::Kind LLLexer::LexIdentifier() { TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context)); TYPEKEYWORD("label", Type::getLabelTy(Context)); TYPEKEYWORD("metadata", Type::getMetadataTy(Context)); - TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); + TYPEKEYWORD("x86_mmx", llvm::FixedVectorType::get( + llvm::IntegerType::get(Context, 64), 1)); TYPEKEYWORD("x86_amx", Type::getX86_AMXTy(Context)); TYPEKEYWORD("token", Type::getTokenTy(Context)); TYPEKEYWORD("ptr", PointerType::getUnqual(Context)); diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index f56b2b32ff98f5..7c9bc66a237d53 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2496,7 +2496,9 @@ Error BitcodeReader::parseTypeTableBody() { ResultTy = Type::getMetadataTy(Context); break; case bitc::TYPE_CODE_X86_MMX: // X86_MMX - ResultTy = Type::getX86_MMXTy(Context); + // Deprecated: decodes as <1 x i64> + ResultTy = + llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); break; case bitc::TYPE_CODE_X86_AMX: // X86_AMX ResultTy = Type::getX86_AMXTy(Context); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 3378931065f9b3..216a0cc8e94e3a 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1086,8 +1086,9 @@ void ModuleBitcodeWriter::writeTypeTable() { case Type::FP128TyID: Code = bitc::TYPE_CODE_FP128; break; case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break; case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break; - case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; - case Type::X86_MMXTyID: Code = bitc::TYPE_CODE_X86_MMX; break; + case Type::MetadataTyID: + Code = bitc::TYPE_CODE_METADATA; + break; case Type::X86_AMXTyID: Code = bitc::TYPE_CODE_X86_AMX; break; case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break; case Type::IntegerTyID: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 276d980c1dcca9..54f7f127ae663e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5225,6 +5225,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, // Ignore the callsite's attributes. 
A specific call site may be marked with // readnone, but the lowering code will expect the chain based on the // definition. + const auto &Triple = DAG.getTarget().getTargetTriple(); const Function *F = I.getCalledFunction(); bool HasChain = !F->doesNotAccessMemory(); bool OnlyLoad = HasChain && F->onlyReadsMemory(); @@ -5272,10 +5273,21 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, DAG.getTargetConstantFP(*cast(Arg), SDLoc(), VT)); } } + if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) { + for (SDValue &Op : Ops) { + if (Op.getValueType() == MVT::v1i64) + Op = DAG.getBitcast(MVT::x86mmx, Op); + } + } SmallVector ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); + if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) { + if (ValueVTs.size() == 1 && ValueVTs[0] == MVT::v1i64) + ValueVTs[0] = MVT::x86mmx; + } + if (HasChain) ValueVTs.push_back(MVT::Other); @@ -5344,6 +5356,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } + if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) { + if (Result.getValueType() == MVT::x86mmx) + Result = DAG.getBitcast(MVT::v1i64, Result); + } + setValue(&I, Result); } diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index b0f736a49c20ed..0c6b726a28a242 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -207,7 +207,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { assert(isExtended() && "Type is not extended!"); return LLVMTy; case MVT::isVoid: return Type::getVoidTy(Context); - case MVT::x86mmx: return Type::getX86_MMXTy(Context); + case MVT::x86mmx: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); case MVT::aarch64svcount: return TargetExtType::get(Context, "aarch64.svcount"); case MVT::x86amx: return Type::getX86_AMXTy(Context); @@ -241,8 +241,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){ case Type::BFloatTyID: return MVT(MVT::bf16); case Type::FloatTyID: return MVT(MVT::f32); case Type::DoubleTyID: return MVT(MVT::f64); - case Type::X86_FP80TyID: return MVT(MVT::f80); - case Type::X86_MMXTyID: return MVT(MVT::x86mmx); + case Type::X86_FP80TyID: + return MVT(MVT::f80); case Type::TargetExtTyID: { TargetExtType *TargetExtTy = cast(Ty); if (TargetExtTy->getName() == "aarch64.svcount") @@ -302,4 +302,3 @@ void MVT::print(raw_ostream &OS) const { else OS << EVT(*this).getEVTString(); } - diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 6599730590de60..01a16ccd688f43 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -573,8 +573,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) { case Type::FP128TyID: OS << "fp128"; return; case Type::PPC_FP128TyID: OS << "ppc_fp128"; return; case Type::LabelTyID: OS << "label"; return; - case Type::MetadataTyID: OS << "metadata"; return; - case Type::X86_MMXTyID: OS << "x86_mmx"; return; + case Type::MetadataTyID: + OS << "metadata"; + return; case Type::X86_AMXTyID: OS << "x86_amx"; return; case Type::TokenTyID: OS << "token"; return; case Type::IntegerTyID: diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 693674ae0d06f6..05ab0968ef6f39 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -142,7 +142,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, return UndefValue::get(DestTy); } - if (V->isNullValue() && !DestTy->isX86_MMXTy() && 
!DestTy->isX86_AMXTy() && + if (V->isNullValue() && !DestTy->isX86_AMXTy() && opc != Instruction::AddrSpaceCast) return Constant::getNullValue(DestTy); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 9ba78731060439..b28c3ec56827a9 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -609,8 +609,6 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) { return LLVMPointerTypeKind; case Type::FixedVectorTyID: return LLVMVectorTypeKind; - case Type::X86_MMXTyID: - return LLVMX86_MMXTypeKind; case Type::X86_AMXTyID: return LLVMX86_AMXTypeKind; case Type::TokenTyID: @@ -725,9 +723,6 @@ LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) { LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C)); } -LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) { - return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C)); -} LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getX86_AMXTy(*unwrap(C)); } @@ -753,9 +748,6 @@ LLVMTypeRef LLVMFP128Type(void) { LLVMTypeRef LLVMPPCFP128Type(void) { return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext()); } -LLVMTypeRef LLVMX86MMXType(void) { - return LLVMX86MMXTypeInContext(LLVMGetGlobalContext()); -} LLVMTypeRef LLVMX86AMXType(void) { return LLVMX86AMXTypeInContext(LLVMGetGlobalContext()); } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 27411653324874..17897f77b4edb8 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -835,7 +835,6 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { // layout. return Align(PowerOf2Ceil(BitWidth / 8)); } - case Type::X86_MMXTyID: case Type::FixedVectorTyID: case Type::ScalableVectorTyID: { unsigned BitWidth = getTypeSizeInBits(Ty).getKnownMinValue(); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 20871982afb062..9b0dd5fca7e0e9 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1052,8 +1052,9 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) { case Type::DoubleTyID: Result += "f64"; break; case Type::X86_FP80TyID: Result += "f80"; break; case Type::FP128TyID: Result += "f128"; break; - case Type::PPC_FP128TyID: Result += "ppcf128"; break; - case Type::X86_MMXTyID: Result += "x86mmx"; break; + case Type::PPC_FP128TyID: + Result += "ppcf128"; + break; case Type::X86_AMXTyID: Result += "x86amx"; break; case Type::IntegerTyID: Result += "i" + utostr(cast(Ty)->getBitWidth()); @@ -1397,7 +1398,8 @@ static Type *DecodeFixedType(ArrayRef &Infos, switch (D.Kind) { case IITDescriptor::Void: return Type::getVoidTy(Context); case IITDescriptor::VarArg: return Type::getVoidTy(Context); - case IITDescriptor::MMX: return Type::getX86_MMXTy(Context); + case IITDescriptor::MMX: + return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); case IITDescriptor::AMX: return Type::getX86_AMXTy(Context); case IITDescriptor::Token: return Type::getTokenTy(Context); case IITDescriptor::Metadata: return Type::getMetadataTy(Context); @@ -1580,7 +1582,11 @@ static bool matchIntrinsicType( switch (D.Kind) { case IITDescriptor::Void: return !Ty->isVoidTy(); case IITDescriptor::VarArg: return true; - case IITDescriptor::MMX: return !Ty->isX86_MMXTy(); + case IITDescriptor::MMX: { + FixedVectorType *VT = dyn_cast(Ty); + return !VT || VT->getNumElements() != 1 || + !VT->getElementType()->isIntegerTy(64); + } case IITDescriptor::AMX: return !Ty->isX86_AMXTy(); case IITDescriptor::Token: return 
!Ty->isTokenTy(); case IITDescriptor::Metadata: return !Ty->isMetadataTy(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 7a8cf8c2304986..58ebe7e95cd06c 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3116,9 +3116,6 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) { if (SrcBits != DestBits) return false; - if (DestTy->isX86_MMXTy() || SrcTy->isX86_MMXTy()) - return false; - return true; } @@ -3228,12 +3225,6 @@ CastInst::getCastOpcode( return IntToPtr; // int -> ptr } llvm_unreachable("Casting pointer to other than pointer or int"); - } else if (DestTy->isX86_MMXTy()) { - if (SrcTy->isVectorTy()) { - assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX"); - return BitCast; // 64-bit vector to MMX - } - llvm_unreachable("Illegal cast to X86_MMX"); } llvm_unreachable("Casting to type that is not first-class"); } diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 0a376179d609cd..4f1ef8cec32133 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -40,9 +40,9 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C) FloatTy(C, Type::FloatTyID), DoubleTy(C, Type::DoubleTyID), MetadataTy(C, Type::MetadataTyID), TokenTy(C, Type::TokenTyID), X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID), - PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID), - X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8), - Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) {} + PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_AMXTy(C, Type::X86_AMXTyID), + Int1Ty(C, 1), Int8Ty(C, 8), Int16Ty(C, 16), Int32Ty(C, 32), + Int64Ty(C, 64), Int128Ty(C, 128) {} LLVMContextImpl::~LLVMContextImpl() { #ifndef NDEBUG diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 937a87d686175f..8e9ca21d149f65 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1582,7 +1582,7 @@ class LLVMContextImpl { // Basic type instances. Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy, TokenTy; - Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy, X86_AMXTy; + Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_AMXTy; IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty; std::unique_ptr TheNoneToken; diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 5c61ad9f000b03..18a547e75fe1e4 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -44,8 +44,8 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { case FP128TyID : return getFP128Ty(C); case PPC_FP128TyID : return getPPC_FP128Ty(C); case LabelTyID : return getLabelTy(C); - case MetadataTyID : return getMetadataTy(C); - case X86_MMXTyID : return getX86_MMXTy(C); + case MetadataTyID: + return getMetadataTy(C); case X86_AMXTyID : return getX86_AMXTy(C); case TokenTyID : return getTokenTy(C); default: @@ -125,14 +125,6 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { if (isa(this) && isa(Ty)) return getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits(); - // 64-bit fixed width vector types can be losslessly converted to x86mmx. - if (((isa(this)) && Ty->isX86_MMXTy()) && - getPrimitiveSizeInBits().getFixedValue() == 64) - return true; - if ((isX86_MMXTy() && isa(Ty)) && - Ty->getPrimitiveSizeInBits().getFixedValue() == 64) - return true; - // 8192-bit fixed width vector types can be losslessly converted to x86amx. 
if (((isa(this)) && Ty->isX86_AMXTy()) && getPrimitiveSizeInBits().getFixedValue() == 8192) @@ -179,8 +171,6 @@ TypeSize Type::getPrimitiveSizeInBits() const { return TypeSize::getFixed(128); case Type::PPC_FP128TyID: return TypeSize::getFixed(128); - case Type::X86_MMXTyID: - return TypeSize::getFixed(64); case Type::X86_AMXTyID: return TypeSize::getFixed(8192); case Type::IntegerTyID: @@ -245,7 +235,6 @@ Type *Type::getTokenTy(LLVMContext &C) { return &C.pImpl->TokenTy; } Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; } Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; } Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; } -Type *Type::getX86_MMXTy(LLVMContext &C) { return &C.pImpl->X86_MMXTy; } Type *Type::getX86_AMXTy(LLVMContext &C) { return &C.pImpl->X86_AMXTy; } IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; } diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 3433408f051715..cd0d6d34e9a67b 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1048,9 +1048,6 @@ void DXILBitcodeWriter::writeTypeTable() { case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; - case Type::X86_MMXTyID: - Code = bitc::TYPE_CODE_X86_MMX; - break; case Type::IntegerTyID: // INTEGER: [width] Code = bitc::TYPE_CODE_INTEGER; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index e5d10a75728bf8..0c1b0aea41f41f 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -329,7 +329,6 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty, case Type::PPC_FP128TyID: case Type::LabelTyID: case Type::MetadataTyID: - case Type::X86_MMXTyID: case Type::X86_AMXTyID: case Type::TokenTyID: case Type::TypedPointerTyID: diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 9ec68bfb8e0f7e..c55ff3dfc9c8e1 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -168,10 +168,6 @@ def CC_#NAME : CallingConv<[ CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64], CCAssignToStack<8, 4>>, - // MMX type gets 8 byte slot in stack , while alignment depends on target - CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>, - CCIfType<[x86mmx], CCAssignToStack<8, 4>>, - // float 128 get stack slots whose size and alignment depends // on the subtarget. CCIfType<[f80, f128], CCAssignToStack<0, 0>>, @@ -286,10 +282,6 @@ def RetCC_X86Common : CallingConv<[ CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, - // MMX vector types are always returned in MM0. If the target doesn't have - // MM0, it doesn't support these vector types. - CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, - // Long double types are always returned in FP0 (even with SSE), // except on Win64. CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>> @@ -376,9 +368,6 @@ def RetCC_X86_64_C : CallingConv<[ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, - // MMX vector types are always returned in XMM0. - CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, - // Pointers are always returned in full 64-bit registers. 
CCIfPtr>, @@ -389,9 +378,6 @@ def RetCC_X86_64_C : CallingConv<[ // X86-Win64 C return-value convention. def RetCC_X86_Win64_C : CallingConv<[ - // The X86-Win64 calling convention always returns __m64 values in RAX. - CCIfType<[x86mmx], CCBitConvertToType>, - // GCC returns FP values in RAX on Win64. CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, @@ -436,8 +422,6 @@ def RetCC_X86_64_Swift : CallingConv<[ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, - // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3. - CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, CCDelegateTo ]>; @@ -572,12 +556,6 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, - // The first 8 MMX vector arguments are passed in XMM registers on Darwin. - CCIfType<[x86mmx], - CCIfSubtarget<"isTargetDarwin()", - CCIfSubtarget<"hasSSE2()", - CCPromoteToType>>>, - // Boolean vectors of AVX-512 are passed in SIMD registers. // The call from AVX to AVX-512 function should work, // since the boolean types in AVX/AVX2 are promoted by default. @@ -666,9 +644,6 @@ def CC_X86_Win64_C : CallingConv<[ // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, - // The first 4 MMX vector arguments are passed in GPRs. - CCIfType<[x86mmx], CCBitConvertToType>, - // If SSE was disabled, pass FP values smaller than 64-bits as integers in // GPRs or on the stack. CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, @@ -843,11 +818,6 @@ def CC_X86_32_Common : CallingConv<[ CCIfNotVarArg>>>, - // The first 3 __m64 vector arguments are passed in mmx registers if the - // call is not a vararg call. - CCIfNotVarArg>>, - CCIfType<[f16], CCAssignToStack<4, 4>>, // Integer/Float values get stored in stack slots that are 4 bytes in @@ -870,10 +840,6 @@ def CC_X86_32_Common : CallingConv<[ CCIfType<[v32i1], CCPromoteToType>, CCIfType<[v64i1], CCPromoteToType>, - // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are - // passed in the parameter area. - CCIfType<[x86mmx], CCAssignToStack<8, 4>>, - // Darwin passes vectors in a form that differs from the i386 psABI CCIfSubtarget<"isTargetDarwin()", CCDelegateTo>, diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 322cb6f6f5819b..793d62ba2a8e79 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -623,11 +623,13 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II, if (isa(Arg)) return Constant::getNullValue(ResTy); - auto *ArgTy = dyn_cast(Arg->getType()); - // We can't easily peek through x86_mmx types. - if (!ArgTy) + // Preserve previous behavior and give up. + // TODO: treat as <8 x i8>. + if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) return nullptr; + auto *ArgTy = cast(Arg->getType()); + // Expand MOVMSK to compare/bitcast/zext: // e.g. 
PMOVMSKB(v16i8 x): // %cmp = icmp slt <16 x i8> %x, zeroinitializer diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index a164c82bdf75d4..f5a7ab26a49e96 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -962,8 +962,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { } else if (NewCB->getType()->isVoidTy()) { // If the return value is dead, replace any uses of it with poison // (any non-debug value uses will get removed later on). - if (!CB.getType()->isX86_MMXTy()) - CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); + CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); } else { assert((RetTy->isStructTy() || RetTy->isArrayTy()) && "Return type changed, but not into a void. The old return type" @@ -1027,8 +1026,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { } else { // If this argument is dead, replace any uses of it with poison // (any non-debug value uses will get removed later on). - if (!I->getType()->isX86_MMXTy()) - I->replaceAllUsesWith(PoisonValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); } // If we change the return value of the function we must rewrite any return diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 7b1268939e9c4b..7fcbeb35de71df 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2677,13 +2677,6 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { return replaceInstUsesWith(CI, Src); if (FixedVectorType *DestVTy = dyn_cast(DestTy)) { - // Beware: messing with this target-specific oddity may cause trouble. - if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) { - Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType()); - return InsertElementInst::Create(PoisonValue::get(DestTy), Elem, - Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); - } - if (isa(SrcTy)) { // If this is a cast from an integer to vector, check to see if the input // is a trunc or zext of a bitcast from vector. If so, we can replace all diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index c7d41f6298372d..07f84479bf74a2 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2977,8 +2977,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// Caller guarantees that this intrinsic does not access memory. bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) { Type *RetTy = I.getType(); - if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy() || - RetTy->isX86_MMXTy())) + if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy())) return false; unsigned NumArgOperands = I.arg_size(); @@ -3208,7 +3207,7 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } - // Get an X86_MMX-sized vector type. + // Get an MMX-sized vector type. Type *getMMXVectorTy(unsigned EltSizeInBits) { const unsigned X86_MMXSizeInBits = 64; assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 && @@ -3254,20 +3253,21 @@ struct MemorySanitizerVisitor : public InstVisitor { // packs elements of 2 input vectors into half as many bits with saturation. 
// Shadow is propagated with the signed variant of the same intrinsic applied // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer). - // EltSizeInBits is used only for x86mmx arguments. - void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) { + // MMXEltSizeInBits is used only for x86mmx arguments. + void handleVectorPackIntrinsic(IntrinsicInst &I, + unsigned MMXEltSizeInBits = 0) { assert(I.arg_size() == 2); - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); IRBuilder<> IRB(&I); Value *S1 = getShadow(&I, 0); Value *S2 = getShadow(&I, 1); - assert(isX86_MMX || S1->getType()->isVectorTy()); + assert(S1->getType()->isVectorTy()); // SExt and ICmpNE below must apply to individual elements of input vectors. // In case of x86mmx arguments, cast them to appropriate vector types and // back. - Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType(); - if (isX86_MMX) { + Type *T = + MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits) : S1->getType(); + if (MMXEltSizeInBits) { S1 = IRB.CreateBitCast(S1, T); S2 = IRB.CreateBitCast(S2, T); } @@ -3275,10 +3275,9 @@ struct MemorySanitizerVisitor : public InstVisitor { IRB.CreateSExt(IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T); Value *S2_ext = IRB.CreateSExt(IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T); - if (isX86_MMX) { - Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C); - S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy); - S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy); + if (MMXEltSizeInBits) { + S1_ext = IRB.CreateBitCast(S1_ext, getMMXVectorTy(64)); + S2_ext = IRB.CreateBitCast(S2_ext, getMMXVectorTy(64)); } Function *ShadowFn = Intrinsic::getDeclaration( @@ -3286,7 +3285,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *S = IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack"); - if (isX86_MMX) + if (MMXEltSizeInBits) S = IRB.CreateBitCast(S, getShadowTy(&I)); setShadow(&I, S); setOriginForNaryOp(I); @@ -3393,10 +3392,9 @@ struct MemorySanitizerVisitor : public InstVisitor { } // Instrument sum-of-absolute-differences intrinsic. - void handleVectorSadIntrinsic(IntrinsicInst &I) { + void handleVectorSadIntrinsic(IntrinsicInst &I, bool IsMMX = false) { const unsigned SignificantBitsPerResultElement = 16; - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); - Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType(); + Type *ResTy = IsMMX ? IntegerType::get(*MS.C, 64) : I.getType(); unsigned ZeroBitsPerResultElement = ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement; @@ -3415,9 +3413,9 @@ struct MemorySanitizerVisitor : public InstVisitor { // Instrument multiply-add intrinsic. void handleVectorPmaddIntrinsic(IntrinsicInst &I, - unsigned EltSizeInBits = 0) { - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); - Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType(); + unsigned MMXEltSizeInBits = 0) { + Type *ResTy = + MMXEltSizeInBits ? 
getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType(); IRBuilder<> IRB(&I); auto *Shadow0 = getShadow(&I, 0); auto *Shadow1 = getShadow(&I, 1); @@ -4088,6 +4086,8 @@ struct MemorySanitizerVisitor : public InstVisitor { break; case Intrinsic::x86_mmx_psad_bw: + handleVectorSadIntrinsic(I, true); + break; case Intrinsic::x86_sse2_psad_bw: case Intrinsic::x86_avx2_psad_bw: handleVectorSadIntrinsic(I); @@ -4968,7 +4968,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase { Type *T = arg->getType(); if (T->isX86_FP80Ty()) return AK_Memory; - if (T->isFPOrFPVectorTy() || T->isX86_MMXTy()) + if (T->isFPOrFPVectorTy()) return AK_FloatingPoint; if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) return AK_GeneralPurpose; diff --git a/llvm/test/Assembler/x86mmx.ll b/llvm/test/Assembler/x86mmx.ll deleted file mode 100644 index 608347e0fceb10..00000000000000 --- a/llvm/test/Assembler/x86mmx.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llvm-as < %s | llvm-dis | FileCheck %s -; RUN: verify-uselistorder %s -; Basic smoke test for x86_mmx type. - -; CHECK: define x86_mmx @sh16 -define x86_mmx @sh16(x86_mmx %A) { -; CHECK: ret x86_mmx %A - ret x86_mmx %A -} diff --git a/llvm/test/Bitcode/bcanalyzer-types.ll b/llvm/test/Bitcode/bcanalyzer-types.ll index cbe6f5d22c9479..f1732db174c295 100644 --- a/llvm/test/Bitcode/bcanalyzer-types.ll +++ b/llvm/test/Bitcode/bcanalyzer-types.ll @@ -3,7 +3,6 @@ ; CHECK: Block ID {{.*}} (TYPE_BLOCK_ID) ; CHECK: BFLOAT ; CHECK: TOKEN -; CHECK: X86_MMX ; CHECK: HALF ; CHECK: Block ID @@ -12,11 +11,6 @@ define half @test_half(half %x, half %y) { ret half %a } -define x86_mmx @test_mmx(<2 x i32> %x) { - %a = bitcast <2 x i32> %x to x86_mmx - ret x86_mmx %a -} - define bfloat @test_bfloat(i16 %x) { %a = bitcast i16 %x to bfloat ret bfloat %a diff --git a/llvm/test/Bitcode/compatibility-3.6.ll b/llvm/test/Bitcode/compatibility-3.6.ll index 2190e2fbccf288..37a87eea41ad36 100644 --- a/llvm/test/Bitcode/compatibility-3.6.ll +++ b/llvm/test/Bitcode/compatibility-3.6.ll @@ -645,7 +645,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.7.ll b/llvm/test/Bitcode/compatibility-3.7.ll index 7e59b5c1be6e2f..8de2132d7ec892 100644 --- a/llvm/test/Bitcode/compatibility-3.7.ll +++ b/llvm/test/Bitcode/compatibility-3.7.ll @@ -689,7 +689,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.8.ll b/llvm/test/Bitcode/compatibility-3.8.ll index ebd1f2fff8c94c..7f766aa34a005f 100644 --- a/llvm/test/Bitcode/compatibility-3.8.ll +++ b/llvm/test/Bitcode/compatibility-3.8.ll @@ -742,7 +742,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.9.ll b/llvm/test/Bitcode/compatibility-3.9.ll index c34f04ceb0de39..c8309175e063f0 100644 --- a/llvm/test/Bitcode/compatibility-3.9.ll +++ b/llvm/test/Bitcode/compatibility-3.9.ll @@ -813,7 +813,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = 
alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll index 05bffda1d117a3..adbd91ac6c7fe5 100644 --- a/llvm/test/Bitcode/compatibility-4.0.ll +++ b/llvm/test/Bitcode/compatibility-4.0.ll @@ -813,7 +813,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll index 0c872289c62ba8..1b500da69568af 100644 --- a/llvm/test/Bitcode/compatibility-5.0.ll +++ b/llvm/test/Bitcode/compatibility-5.0.ll @@ -820,7 +820,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll index 44c680885be34f..c1abbf0cda6eb9 100644 --- a/llvm/test/Bitcode/compatibility-6.0.ll +++ b/llvm/test/Bitcode/compatibility-6.0.ll @@ -830,7 +830,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index e437c37d8d1c87..a7567038b7a7ba 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1113,7 +1113,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca ptr ; CHECK: %t8 = alloca ptr %t9 = alloca <4 x i32> diff --git a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll index ac86279ca6667e..3a112ae2a2113e 100644 --- a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll +++ b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll @@ -29,7 +29,17 @@ entry: define <2 x double> @a2(x86_mmx %x) nounwind { ; CHECK-LABEL: a2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtpi2pd %mm0, %xmm0 +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: movl 8(%ebp), %eax +; CHECK-NEXT: movl 12(%ebp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: cvtpi2pd (%esp), %xmm0 +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl entry: %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x) @@ -39,7 +49,16 @@ entry: define x86_mmx @b2(<2 x double> %x) nounwind { ; CHECK-LABEL: b2: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $8, %esp ; CHECK-NEXT: cvttpd2pi %xmm0, %mm0 +; CHECK-NEXT: movq %mm0, (%esp) +; CHECK-NEXT: movl (%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl entry: %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x) diff --git a/llvm/test/CodeGen/X86/3dnow-intrinsics.ll b/llvm/test/CodeGen/X86/3dnow-intrinsics.ll index a82f705b77d848..73870d57cb79a9 100644 --- 
a/llvm/test/CodeGen/X86/3dnow-intrinsics.ll +++ b/llvm/test/CodeGen/X86/3dnow-intrinsics.ll @@ -5,15 +5,32 @@ define <8 x i8> @test_pavgusb(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone { ; X86-LABEL: test_pavgusb: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pavgusb %mm1, %mm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl 20(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movq (%esp), %mm0 +; X86-NEXT: pavgusb {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: movq %mm0, (%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test_pavgusb: ; X64: # %bb.0: # %entry -; X64-NEXT: pavgusb %mm1, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pavgusb %mm0, %mm1 +; X64-NEXT: movq2dq %mm1, %xmm0 ; X64-NEXT: retq entry: %0 = bitcast x86_mmx %a.coerce to <8 x i8> @@ -638,8 +655,12 @@ define <2 x float> @test_pi2fd(x86_mmx %a.coerce) nounwind readnone { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: pi2fd %mm0, %mm0 +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: pi2fd {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) @@ -649,6 +670,7 @@ define <2 x float> @test_pi2fd(x86_mmx %a.coerce) nounwind readnone { ; ; X64-LABEL: test_pi2fd: ; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: pi2fd %mm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 @@ -666,15 +688,32 @@ declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone define <4 x i16> @test_pmulhrw(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone { ; X86-LABEL: test_pmulhrw: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: pmulhrw %mm1, %mm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl 20(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movq (%esp), %mm0 +; X86-NEXT: pmulhrw {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: movq %mm0, (%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test_pmulhrw: ; X64: # %bb.0: # %entry -; X64-NEXT: pmulhrw %mm1, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pmulhrw %mm0, %mm1 +; X64-NEXT: movq2dq %mm1, %xmm0 ; X64-NEXT: retq entry: %0 = bitcast x86_mmx %a.coerce to <4 x i16> @@ -805,8 +844,12 @@ define <2 x float> @test_pi2fw(x86_mmx %a.coerce) nounwind readnone { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: pi2fw %mm0, %mm0 +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax 
+; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: pi2fw {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: flds (%esp) @@ -816,6 +859,7 @@ define <2 x float> @test_pi2fw(x86_mmx %a.coerce) nounwind readnone { ; ; X64-LABEL: test_pi2fw: ; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %mm0 ; X64-NEXT: pi2fw %mm0, %mm0 ; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index b442a6337e3b89..3f6f8c01b9049f 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -1014,17 +1014,13 @@ define float @broadcast_lifetime() nounwind { define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind { ; X86-LABEL: broadcast_x86_mmx: ; X86: ## %bb.0: ## %bb -; X86-NEXT: subl $12, %esp -; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-LABEL: broadcast_x86_mmx: ; X64: ## %bb.0: ## %bb -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-NEXT: vmovq %rdi, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: retq bb: %tmp1 = bitcast x86_mmx %tmp to i64 diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll index b7516d30df5f67..fed6c2eb8ba0a2 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -1452,25 +1452,18 @@ eintry: define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind { ; X86-LABEL: broadcast_x86_mmx: ; X86: ## %bb.0: ## %bb -; X86-NEXT: subl $12, %esp -; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-AVX2-LABEL: broadcast_x86_mmx: ; X64-AVX2: ## %bb.0: ## %bb -; X64-AVX2-NEXT: movdq2q %xmm0, %mm0 -; X64-AVX2-NEXT: movq %mm0, %rax -; X64-AVX2-NEXT: vmovq %rax, %xmm0 +; X64-AVX2-NEXT: vmovq %rdi, %xmm0 ; X64-AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512VL-LABEL: broadcast_x86_mmx: ; X64-AVX512VL: ## %bb.0: ## %bb -; X64-AVX512VL-NEXT: movdq2q %xmm0, %mm0 -; X64-AVX512VL-NEXT: movq %mm0, %rax -; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm0 +; X64-AVX512VL-NEXT: vpbroadcastq %rdi, %xmm0 ; X64-AVX512VL-NEXT: retq bb: %tmp1 = bitcast x86_mmx %tmp to i64 diff --git a/llvm/test/CodeGen/X86/fast-isel-bc.ll b/llvm/test/CodeGen/X86/fast-isel-bc.ll index 0fbc9fab056814..e3bb5e7176e57b 100644 --- a/llvm/test/CodeGen/X86/fast-isel-bc.ll +++ b/llvm/test/CodeGen/X86/fast-isel-bc.ll @@ -12,7 +12,11 @@ define void @func1() nounwind { ; X86-LABEL: func1: ; X86: ## %bb.0: ; X86-NEXT: subl $12, %esp -; X86-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 ## mm0 = 0x200000000 +; X86-NEXT: movl $2, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl %esp, %eax +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: calll _func2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: retl @@ -20,8 +24,7 @@ define void @func1() nounwind { ; X64-LABEL: func1: ; X64: ## %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0 ## mm0 = 0x200000000 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movabsq $8589934592, %rdi ## imm = 0x200000000 ; X64-NEXT: callq _func2 ; X64-NEXT: popq %rax ; X64-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll index c13fdae540d0b8..fd9f4fa63a090e 100644 --- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -94,6 +94,7 @@ entry: ; ; MMX Store +; Note: doesn't actually emit a non-temporal store here. ; define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) { @@ -101,7 +102,7 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) { ; ALL: # %bb.0: # %entry ; ALL-NEXT: movq (%rdi), %mm0 ; ALL-NEXT: psrlq $3, %mm0 -; ALL-NEXT: movntq %mm0, (%rsi) +; ALL-NEXT: movq %mm0, (%rsi) ; ALL-NEXT: retq entry: %0 = load x86_mmx, ptr %a0 diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll index a4dbb10e0d7a04..54f048eb697f6c 100644 --- a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll +++ b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll @@ -10,8 +10,8 @@ define void @t3() nounwind { ; X86-64-LABEL: t3: ; X86-64: ## %bb.0: ; X86-64-NEXT: movq _g_v8qi@GOTPCREL(%rip), %rax -; X86-64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-64-NEXT: movb $1, %al +; X86-64-NEXT: movq (%rax), %rdi +; X86-64-NEXT: xorl %eax, %eax ; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL %tmp3 = load <8 x i8>, ptr @g_v8qi, align 8 %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx @@ -22,12 +22,11 @@ define void @t3() nounwind { define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind { ; X86-64-LABEL: t4: ; X86-64: ## %bb.0: -; X86-64-NEXT: movdq2q %xmm1, %mm0 -; X86-64-NEXT: movdq2q %xmm0, %mm1 -; X86-64-NEXT: movq2dq %mm1, %xmm1 -; X86-64-NEXT: movq2dq %mm0, %xmm0 -; X86-64-NEXT: paddb %xmm1, %xmm0 -; X86-64-NEXT: movb $1, %al +; X86-64-NEXT: movq %rdi, %xmm0 +; X86-64-NEXT: movq %rsi, %xmm1 +; X86-64-NEXT: paddb %xmm0, %xmm1 +; X86-64-NEXT: movq %xmm1, %rdi +; X86-64-NEXT: xorl %eax, %eax ; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL %v1a = bitcast x86_mmx %v1 to <8 x i8> %v2b = bitcast x86_mmx %v2 to <8 x i8> diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing.ll b/llvm/test/CodeGen/X86/mmx-arg-passing.ll index af116a2ac281b3..1ae9920873fafc 100644 --- a/llvm/test/CodeGen/X86/mmx-arg-passing.ll +++ b/llvm/test/CodeGen/X86/mmx-arg-passing.ll @@ -13,15 +13,17 @@ define void @t1(x86_mmx %v1) nounwind { ; X86-32-LABEL: t1: ; X86-32: ## %bb.0: -; X86-32-NEXT: movl L_u1$non_lazy_ptr, %eax -; X86-32-NEXT: movq %mm0, (%eax) +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movl L_u1$non_lazy_ptr, %edx +; X86-32-NEXT: movl %ecx, 4(%edx) +; X86-32-NEXT: movl %eax, (%edx) ; X86-32-NEXT: retl ; ; X86-64-LABEL: t1: ; X86-64: ## %bb.0: -; X86-64-NEXT: movdq2q %xmm0, %mm0 ; X86-64-NEXT: movq _u1@GOTPCREL(%rip), %rax -; X86-64-NEXT: movq %mm0, (%rax) +; X86-64-NEXT: movq %rdi, (%rax) ; X86-64-NEXT: retq store x86_mmx %v1, ptr @u1, align 8 ret void diff --git a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll index 0fa7b24ff445aa..a1240911cd36a1 100644 --- a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll +++ b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll @@ -1,6 +1,6 @@ ; RUN: opt -mtriple=x86_64-- -passes=early-cse -earlycse-debug-hash < %s -S | FileCheck %s -; CHECK: @foo(x86_mmx bitcast (double 0.000000e+00 to x86_mmx)) +; CHECK: @foo(<1 x i64> zeroinitializer) define void @bar() { entry: diff --git a/llvm/test/CodeGen/X86/mmx-bitcast.ll b/llvm/test/CodeGen/X86/mmx-bitcast.ll index f914b8622fcf4b..49c2027f06604e 100644 --- a/llvm/test/CodeGen/X86/mmx-bitcast.ll +++ 
b/llvm/test/CodeGen/X86/mmx-bitcast.ll @@ -58,8 +58,8 @@ define i64 @t3(ptr %p) { define void @t4(<1 x i64> %A, <1 x i64> %B) { ; CHECK-LABEL: t4: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movq %rdi, %mm0 -; CHECK-NEXT: movq %rsi, %mm1 +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 ; CHECK-NEXT: paddusw %mm0, %mm1 ; CHECK-NEXT: movq _R@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq %mm1, (%rax) diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll index 73df6be8d79890..a31339902bb645 100644 --- a/llvm/test/CodeGen/X86/mmx-fold-load.ll +++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll @@ -288,8 +288,13 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddb (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -301,7 +306,8 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt0: ; X64: # %bb.0: # %entry -; X64-NEXT: paddb (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddb (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -321,8 +327,13 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -334,7 +345,8 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt1: ; X64: # %bb.0: # %entry -; X64-NEXT: paddw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -353,8 +365,13 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddd (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -366,7 +383,8 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt2: ; X64: # %bb.0: # %entry -; X64-NEXT: paddd (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddd (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -385,8 +403,13 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddq (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -398,7 +421,8 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind { ; ; 
X64-LABEL: tt3: ; X64: # %bb.0: # %entry -; X64-NEXT: paddq (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddq (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -417,8 +441,13 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddusb (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -430,7 +459,8 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt4: ; X64: # %bb.0: # %entry -; X64-NEXT: paddusb (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddusb (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -449,8 +479,13 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddusw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -462,7 +497,8 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt5: ; X64: # %bb.0: # %entry -; X64-NEXT: paddusw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddusw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -481,8 +517,13 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrlw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -494,7 +535,8 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt6: ; X64: # %bb.0: # %entry -; X64-NEXT: psrlw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrlw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -513,8 +555,13 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrld (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -526,7 +573,8 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt7: ; X64: # %bb.0: # %entry -; X64-NEXT: psrld (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrld (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -545,8 +593,13 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind { ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), 
%eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrlq (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -558,7 +611,8 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt8: ; X64: # %bb.0: # %entry -; X64-NEXT: psrlq (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrlq (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq @@ -612,16 +666,29 @@ declare void @llvm.lifetime.end(i64, ptr nocapture) define x86_mmx @vec_load(ptr %x) { ; X86-LABEL: vec_load: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1] ; X86-NEXT: paddsb %mm0, %mm0 +; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: vec_load: ; X64: # %bb.0: ; X64-NEXT: pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1] ; X64-NEXT: paddsb %mm0, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq %z = load <4 x float>, ptr %x %y = extractelement <4 x float> %z, i32 0 diff --git a/llvm/test/CodeGen/X86/mmx-intrinsics.ll b/llvm/test/CodeGen/X86/mmx-intrinsics.ll index a43d9400cde6c8..69fc6361075449 100644 --- a/llvm/test/CodeGen/X86/mmx-intrinsics.ll +++ b/llvm/test/CodeGen/X86/mmx-intrinsics.ll @@ -32,10 +32,10 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -77,10 +77,10 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test88: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -122,10 +122,10 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test87: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -167,10 +167,10 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test86: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = 
bitcast <1 x i64> %b to <8 x i8> @@ -212,10 +212,10 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test85: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -257,10 +257,10 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test84: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -302,10 +302,10 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test83: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -347,10 +347,10 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test82: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -392,10 +392,10 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test81: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -437,10 +437,10 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test80: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -482,10 +482,10 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test79: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -527,10 +527,10 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) 
nounwind readnone optsize ssp { ; ; X64-LABEL: test78: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -572,10 +572,10 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test77: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -617,10 +617,10 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test76: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packuswb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packuswb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -662,10 +662,10 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test75: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packssdw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packssdw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -707,10 +707,10 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test74: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packsswb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packsswb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -1459,8 +1459,8 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test56: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pxor %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -1504,8 +1504,8 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test55: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: por %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -1549,10 +1549,10 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test54: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pandn %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pandn %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -1594,8 +1594,8 @@ define i64 
@test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test53: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pand %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -1639,8 +1639,8 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test52: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmullw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -1682,8 +1682,8 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test51: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmullw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -1727,8 +1727,8 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test50: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -1772,8 +1772,8 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test49: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaddwd %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -1817,10 +1817,10 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test48: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubusw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubusw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -1862,10 +1862,10 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test47: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubusb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubusb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -1907,10 +1907,10 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test46: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -1952,10 +1952,10 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test45: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubsb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubsb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -2032,10 +2032,10 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind 
readnone optsize ssp { ; ; X64-LABEL: test43: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -2077,10 +2077,10 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test42: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -2122,10 +2122,10 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test41: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -2167,8 +2167,8 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test40: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddusw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2212,8 +2212,8 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test39: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddusb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2257,8 +2257,8 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test38: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2302,8 +2302,8 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test37: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddsb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2382,8 +2382,8 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test35: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddd %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2427,8 +2427,8 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test34: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2472,8 +2472,8 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test33: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: 
movq %rdi, %mm1 ; X64-NEXT: paddb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2517,8 +2517,8 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test32: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: psadbw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2560,8 +2560,8 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test31: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pminsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2605,8 +2605,8 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test30: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pminub %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2650,8 +2650,8 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test29: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaxsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2695,8 +2695,8 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test28: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaxub %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2740,8 +2740,8 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test27: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pavgw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2785,8 +2785,8 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test26: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pavgb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -2884,10 +2884,10 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp { ; ; X64-LABEL: test23: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: maskmovq %mm1, %mm0 +; X64-NEXT: maskmovq %mm0, %mm1 ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %n to <8 x i8> @@ -2926,8 +2926,8 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test22: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhuw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -3041,8 +3041,8 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test20: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmuludq %mm0, %mm1 ; X64-NEXT: movq %mm1, 
%rax ; X64-NEXT: retq @@ -3320,10 +3320,10 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test12: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -3365,10 +3365,10 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test11: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -3410,10 +3410,10 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test10: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -3455,10 +3455,10 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test9: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pshufb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pshufb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -3500,8 +3500,8 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test8: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhrsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq @@ -3545,10 +3545,10 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test7: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pmaddubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pmaddubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> @@ -3590,10 +3590,10 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test6: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -3635,10 +3635,10 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test5: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -3680,10 +3680,10 @@ define i64 @test4(<1 
x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test4: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -3725,10 +3725,10 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test3: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> @@ -3770,10 +3770,10 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test2: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> @@ -3788,10 +3788,26 @@ entry: } define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind { -; ALL-LABEL: test89: -; ALL: # %bb.0: -; ALL-NEXT: cvtpi2ps %mm0, %xmm0 -; ALL-NEXT: ret{{[l|q]}} +; X86-LABEL: test89: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: cvtpi2ps (%esp), %xmm0 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test89: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: cvtpi2ps %mm0, %xmm0 +; X64-NEXT: retq %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b) ret <4 x float> %c } diff --git a/llvm/test/CodeGen/X86/pr23246.ll b/llvm/test/CodeGen/X86/pr23246.ll index 45587b8c69cd40..cd0ece12a19167 100644 --- a/llvm/test/CodeGen/X86/pr23246.ll +++ b/llvm/test/CodeGen/X86/pr23246.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define <2 x i64> @test(x86_mmx %a) #0 { ; CHECK-LABEL: test: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %rdi, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/select-mmx.ll b/llvm/test/CodeGen/X86/select-mmx.ll index 27b7ebb8381cd3..8339cb71d46718 100644 --- a/llvm/test/CodeGen/X86/select-mmx.ll +++ b/llvm/test/CodeGen/X86/select-mmx.ll @@ -14,15 +14,11 @@ define i64 @test47(i64 %arg) { ; ; X64-LABEL: test47: ; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB0_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: pxor %mm0, %mm0 -; X64-NEXT: jmp .LBB0_3 -; X64-NEXT: .LBB0_1: -; X64-NEXT: movl $7, %eax -; X64-NEXT: movd %eax, %mm0 -; X64-NEXT: .LBB0_3: +; X64-NEXT: movl $7, %ecx +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %mm0 ; X64-NEXT: psllw %mm0, %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq @@ -35,17 +31,17 @@ define i64 @test47(i64 %arg) { ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; 
X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: orl 12(%ebp), %eax -; X86-NEXT: je .LBB0_1 -; X86-NEXT: # %bb.2: -; X86-NEXT: pxor %mm0, %mm0 -; X86-NEXT: jmp .LBB0_3 -; X86-NEXT: .LBB0_1: ; X86-NEXT: movl $7, %eax -; X86-NEXT: movd %eax, %mm0 -; X86-NEXT: .LBB0_3: +; X86-NEXT: je .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB0_2: +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psllw %mm0, %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -74,13 +70,8 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) { ; X64-LABEL: test49: ; X64: # %bb.0: ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB1_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: movq %rdx, %mm0 -; X64-NEXT: jmp .LBB1_3 -; X64-NEXT: .LBB1_1: +; X64-NEXT: cmovneq %rdx, %rsi ; X64-NEXT: movq %rsi, %mm0 -; X64-NEXT: .LBB1_3: ; X64-NEXT: psllw %mm0, %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/stack-folding-3dnow.ll b/llvm/test/CodeGen/X86/stack-folding-3dnow.ll index 1cbd61567f3270..d4821c5fa3d416 100644 --- a/llvm/test/CodeGen/X86/stack-folding-3dnow.ll +++ b/llvm/test/CodeGen/X86/stack-folding-3dnow.ll @@ -4,12 +4,13 @@ define x86_mmx @stack_fold_pavgusb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pavgusb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pavgusb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pavgusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -20,12 +21,13 @@ declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pf2id(x86_mmx %a) { ; CHECK-LABEL: stack_fold_pf2id: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pf2id {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %a) nounwind readnone @@ -36,12 +38,13 @@ declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone define x86_mmx @stack_fold_pf2iw(x86_mmx %a) { ; CHECK-LABEL: stack_fold_pf2iw: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pf2iw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %a) nounwind readnone @@ -52,12 +55,13 @@ declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfacc(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfacc: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq 
%rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfacc %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -68,12 +72,13 @@ declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfadd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfadd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfadd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfadd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -84,12 +89,13 @@ declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfcmpeq(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfcmpeq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfcmpeq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfcmpeq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -100,12 +106,13 @@ declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfcmpge(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfcmpge: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfcmpge %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfcmpge {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -116,12 +123,13 @@ declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfcmpgt(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfcmpgt: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfcmpgt %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfcmpgt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -132,12 
+140,13 @@ declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfmax(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfmax: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfmax %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfmax {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -148,12 +157,13 @@ declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfmin(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfmin: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfmin %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfmin {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -164,12 +174,13 @@ declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfmul(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfmul: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfmul %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfmul {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -180,12 +191,13 @@ declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfnacc(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfnacc: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfnacc %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -196,12 +208,13 @@ declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfpnacc(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfpnacc: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfpnacc %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfpnacc {{[-0-9]+}}(%r{{[sb]}}p), 
%mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -212,12 +225,13 @@ declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfrcp(x86_mmx %a) { ; CHECK-LABEL: stack_fold_pfrcp: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pfrcp {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %a) nounwind readnone @@ -228,12 +242,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfrcpit1(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfrcpit1: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfrcpit1 %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfrcpit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -244,12 +259,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfrcpit2(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfrcpit2: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfrcpit2 %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfrcpit2 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -260,12 +276,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfrsqit1(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfrsqit1: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfrsqit1 %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfrsqit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -276,12 +293,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfrsqrt(x86_mmx %a) { ; CHECK-LABEL: stack_fold_pfrsqrt: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pfrsqrt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %a) nounwind readnone @@ -292,12 +310,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfsub(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfsub: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfsub %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfsub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -308,12 +327,13 @@ declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pfsubr(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pfsubr: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pfsubr %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pfsubr {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -324,12 +344,13 @@ declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pi2fd(x86_mmx %a) { ; CHECK-LABEL: stack_fold_pi2fd: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pi2fd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %a) nounwind readnone @@ -340,12 +361,13 @@ declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone define x86_mmx @stack_fold_pi2fw(x86_mmx %a) { ; CHECK-LABEL: stack_fold_pi2fw: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pi2fw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %a) nounwind readnone @@ -356,12 +378,13 @@ declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmulhrw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmulhrw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; 
CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhrw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhrw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -372,13 +395,14 @@ declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pswapd(x86_mmx %a) { ; CHECK-LABEL: stack_fold_pswapd: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pswapd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload ; CHECK-NEXT: # mm0 = mem[1,0] -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %a) nounwind readnone diff --git a/llvm/test/CodeGen/X86/stack-folding-mmx.ll b/llvm/test/CodeGen/X86/stack-folding-mmx.ll index 11ca9e2a547eef..6652a8ca0dbd54 100644 --- a/llvm/test/CodeGen/X86/stack-folding-mmx.ll +++ b/llvm/test/CodeGen/X86/stack-folding-mmx.ll @@ -9,7 +9,7 @@ define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone @@ -20,6 +20,7 @@ declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) { ; CHECK-LABEL: stack_fold_cvtpi2pd: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -35,6 +36,7 @@ declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) { ; CHECK-LABEL: stack_fold_cvtpi2ps: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -55,7 +57,7 @@ define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone @@ -71,7 +73,7 @@ define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvttpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect 
"nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone @@ -87,7 +89,7 @@ define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvttps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone @@ -107,6 +109,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: paddb %mm0, %mm0 ; CHECK-NEXT: movd %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP @@ -139,6 +142,7 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: paddb %mm0, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP @@ -161,12 +165,13 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind { define x86_mmx @stack_fold_pabsb(x86_mmx %a0) { ; CHECK-LABEL: stack_fold_pabsb: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %a0) nounwind readnone @@ -177,12 +182,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone define x86_mmx @stack_fold_pabsd(x86_mmx %a0) { ; CHECK-LABEL: stack_fold_pabsd: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %a0) nounwind readnone @@ -193,12 +199,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone define x86_mmx @stack_fold_pabsw(x86_mmx %a0) { ; CHECK-LABEL: stack_fold_pabsw: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %a0) nounwind readnone @@ -209,12 +216,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) { ; 
CHECK-LABEL: stack_fold_packssdw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packssdw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -225,12 +233,13 @@ declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_packsswb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packsswb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packsswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -241,12 +250,13 @@ declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_packuswb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packuswb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -257,12 +267,13 @@ declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_paddb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -273,12 +284,13 @@ declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_paddd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", 
"=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -289,12 +301,13 @@ declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_paddq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -305,12 +318,13 @@ declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_paddsb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddsb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -321,12 +335,13 @@ declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_paddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -337,12 +352,13 @@ declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_paddusb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddusb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -353,12 +369,13 @@ declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_paddusw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddusw 
%mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -369,12 +386,13 @@ declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_paddw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -385,12 +403,13 @@ declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_palignr(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_palignr: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: palignr $1, %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %a, x86_mmx %b, i8 1) nounwind readnone @@ -401,12 +420,13 @@ declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pand: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pand %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pand {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -417,12 +437,13 @@ declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pandn: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pandn %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -433,12 +454,13 @@ declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind 
readnone define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pavgb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pavgb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pavgb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -449,12 +471,13 @@ declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pavgw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pavgw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pavgw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -465,12 +488,13 @@ declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pcmpeqb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -481,12 +505,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pcmpeqd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -497,12 +522,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pcmpeqw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; 
CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -513,12 +539,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pcmpgtb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -529,12 +556,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pcmpgtd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -545,12 +573,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pcmpgtw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -561,12 +590,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_phaddd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_phaddd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -577,12 +607,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_phaddsw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_phaddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 
+; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -593,12 +624,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_phaddw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_phaddw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -609,12 +641,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_phsubd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_phsubd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -625,12 +658,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_phsubsw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_phsubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -641,12 +675,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_phsubw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_phsubw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %a, 
x86_mmx %b) nounwind readnone @@ -659,12 +694,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmaddubsw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmaddubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaddubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -675,12 +711,13 @@ declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmaddwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaddwd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -691,12 +728,13 @@ declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmaxsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaxsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -707,12 +745,13 @@ declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmaxub: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaxub %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -723,12 +762,13 @@ declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pminsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pminsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: 
nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pminsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -739,12 +779,13 @@ declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pminub: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pminub %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pminub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -755,12 +796,13 @@ declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmulhrsw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmulhrsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhrsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -771,12 +813,13 @@ declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmulhuw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhuw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -787,12 +830,13 @@ declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmulhw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -803,12 +847,13 @@ declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmullw(x86_mmx 
%a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmullw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmullw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmullw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -819,12 +864,13 @@ declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pmuludq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmuludq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -835,12 +881,13 @@ declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_por: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: por %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -851,12 +898,13 @@ declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psadbw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psadbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -867,14 +915,13 @@ declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pshufb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pshufb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload -; CHECK-NEXT: pshufb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 
8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -885,13 +932,14 @@ declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pshufw(x86_mmx %a) { ; CHECK-LABEL: stack_fold_pshufw: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pshufw $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload ; CHECK-NEXT: # mm0 = mem[1,0,0,0] -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone @@ -902,12 +950,13 @@ declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone define x86_mmx @stack_fold_psignb(x86_mmx %a0, x86_mmx %a1) { ; CHECK-LABEL: stack_fold_psignb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %a0, x86_mmx %a1) nounwind readnone @@ -918,12 +967,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psignd(x86_mmx %a0, x86_mmx %a1) { ; CHECK-LABEL: stack_fold_psignd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %a0, x86_mmx %a1) nounwind readnone @@ -934,12 +984,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psignw(x86_mmx %a0, x86_mmx %a1) { ; CHECK-LABEL: stack_fold_psignw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %a0, x86_mmx %a1) nounwind readnone @@ -950,12 +1001,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pslld %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pslld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -966,12 +1018,13 @@ declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psllq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psllq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -982,12 +1035,13 @@ declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psllw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psllw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -998,12 +1052,13 @@ declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrad %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrad {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1014,12 +1069,13 @@ declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psraw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psraw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) 
nounwind readnone @@ -1030,12 +1086,13 @@ declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrld %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1046,12 +1103,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrlq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrlq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1062,12 +1120,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrlw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrlw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1078,12 +1137,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psubb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1094,12 +1154,13 @@ declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psubd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubd 
{{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1110,12 +1171,13 @@ declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psubq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1126,12 +1188,13 @@ declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psubsb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubsb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1142,12 +1205,13 @@ declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1158,12 +1222,13 @@ declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psubusb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubusb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1174,12 +1239,13 @@ declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psubusw: ; CHECK: # 
%bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubusw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1190,12 +1256,13 @@ declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_psubw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1206,13 +1273,13 @@ declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_punpckhbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1223,13 +1290,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_punpckhdq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[1],mem[1] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1240,13 +1307,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_punpckhwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3] +; CHECK-NEXT: movq %mm1, %rax ; 
CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[2],mem[2],mm0[3],mem[3] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1257,13 +1324,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_punpcklbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1274,13 +1341,13 @@ declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_punpckldq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1291,13 +1358,13 @@ declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_punpcklwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0],mm0[1],mem[1] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone @@ -1308,12 +1375,13 @@ declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) { ; CHECK-LABEL: stack_fold_pxor: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pxor %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pxor 
{{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() %2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll index 672b4591316ce8..6fd90243a93033 100644 --- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll @@ -96,12 +96,13 @@ entry: define i32 @test3(x86_mmx %a) nounwind { ; X86-LABEL: test3: ; X86: # %bb.0: -; X86-NEXT: movd %mm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: test3: ; X64: # %bb.0: -; X64-NEXT: movd %mm0, %eax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq %tmp0 = bitcast x86_mmx %a to <2 x i32> %tmp1 = extractelement <2 x i32> %tmp0, i32 0 @@ -112,14 +113,12 @@ define i32 @test3(x86_mmx %a) nounwind { define i32 @test4(x86_mmx %a) nounwind { ; X86-LABEL: test4: ; X86: # %bb.0: -; X86-NEXT: movq2dq %mm0, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: test4: ; X64: # %bb.0: -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll index cea047453de43e..aed8782ba40aa7 100644 --- a/llvm/test/CodeGen/X86/vec_insert-7.ll +++ b/llvm/test/CodeGen/X86/vec_insert-7.ll @@ -9,13 +9,12 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { ; X86-LABEL: mmx_movzl: ; X86: ## %bb.0: ; X86-NEXT: movl $32, %eax -; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; ; X64-LABEL: mmx_movzl: ; X64: ## %bb.0: ; X64-NEXT: movl $32, %eax -; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: retq %tmp = bitcast x86_mmx %x to <2 x i32> %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll index f561a2a20e194f..c00417080fe361 100644 --- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll @@ -6,15 +6,15 @@ define x86_mmx @t0(i32 %A) nounwind { ; X86-LABEL: t0: ; X86: ## %bb.0: -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 -; X86-NEXT: pxor %mm0, %mm0 -; X86-NEXT: punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0] +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: t0: ; X64: ## %bb.0: ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X64-NEXT: psllq $32, %xmm0 +; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: retq %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll index 39b2b6225d8b10..dce1aa2bcd1d41 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll @@ -17,16 +17,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; 
CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5:[0-9]+]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -57,16 +57,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], 
ptr @__msan_retval_tls, align 8 @@ -97,16 +97,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -137,16 +137,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: 
[[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -177,16 +177,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -217,16 +217,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x 
i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -257,16 +257,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -297,16 +297,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] 
= bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -337,16 +337,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -377,16 +377,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx 
@llvm.x86.mmx.punpcklbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -417,16 +417,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -457,16 +457,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; 
CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -497,16 +497,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -537,23 +537,22 @@ define i64 
@test76(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -586,23 +585,22 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <2 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to 
<1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <2 x i1> [[TMP12]] to <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <4 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <4 x i16> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -635,23 +633,22 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx 
@llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -681,17 +678,15 @@ define i64 @test73(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -717,17 +712,15 @@ define i64 @test72(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; 
CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -751,17 +744,15 @@ define i64 @test72_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -787,13 +778,13 @@ define i64 @test71(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() 
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[TMP6]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[TMP2]], i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP4]] ; @@ -815,17 +806,15 @@ define i64 @test70(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -849,17 +838,15 @@ define i64 @test70_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: 
[[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -885,17 +872,15 @@ define i64 @test69(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr 
@__msan_retval_tls, align 8 @@ -921,13 +906,13 @@ define i64 @test68(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[TMP6]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[TMP2]], i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP4]] ; @@ -949,17 +934,15 @@ define i64 @test67(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -985,17 +968,15 @@ define i64 @test66(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -1019,17 +1000,15 @@ define i64 @test66_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> 
[[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -1056,20 +1035,21 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1100,20 +1080,21 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] 
to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1144,18 +1125,21 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64 ; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP5]] ; @@ -1180,20 +1164,21 @@ 
define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1224,20 +1209,21 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to 
x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1268,18 +1254,21 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64 ; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP5]] ; @@ -1304,20 +1293,21 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1348,20 +1338,21 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] 
to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1394,16 +1385,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1434,16 +1425,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; 
CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1474,16 +1465,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1514,16 +1505,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 
x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1554,16 +1545,16 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; 
CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1592,16 +1583,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1632,16 +1623,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast 
<4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1672,22 +1663,22 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP12]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP18]] ; @@ -1716,16 +1707,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = 
tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1756,16 +1747,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1796,16 +1787,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to 
i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1836,16 +1827,16 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1872,13 +1863,16 @@ define i64 
@test44(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; @@ -1907,16 +1901,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1947,16 +1941,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> 
[[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1987,16 +1981,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = 
extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2027,16 +2021,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2067,16 +2061,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; 
CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2107,16 +2101,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2147,16 +2141,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> 
[[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2185,13 +2179,16 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; @@ -2218,16 +2215,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; 
CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2258,16 +2255,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2298,16 +2295,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> 
[[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2338,17 +2335,20 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 48 -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64 [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP17]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP14]] to i64 +; CHECK-NEXT: store i64 [[TMP15]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: @@ -2374,16 +2374,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; 
CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2414,16 +2414,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2454,16 +2454,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: 
[[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2494,16 +2494,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2534,16 +2534,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> 
[[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2574,16 +2574,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = 
extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2612,16 +2612,19 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]] ; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], x86_mmx [[MMX_VAR_I]]) #[[ATTR2]] +; CHECK: 8: +; CHECK-NEXT: tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], <1 x i64> [[MMX_VAR_I]]) #[[ATTR2]] ; CHECK-NEXT: ret void ; entry: @@ -2641,15 +2644,16 @@ define i32 @test24(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP6]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[MMX_VAR_I]]) #[[ATTR2]] +; CHECK: 6: +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[MMX_VAR_I]]) #[[ATTR2]] ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP1]] ; @@ -2674,21 +2678,23 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[N]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[D]] to <8 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP5]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: 
[[TMP10:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP9]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP8]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] -; CHECK: 9: +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 11: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: tail call void @llvm.x86.mmx.maskmovq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]] +; CHECK: 12: +; CHECK-NEXT: tail call void @llvm.x86.mmx.maskmovq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]] ; CHECK-NEXT: ret void ; entry: @@ -2713,16 +2719,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2750,16 +2756,17 @@ define i64 @test21(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: 
[[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK: 7: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 @@ -2783,16 +2790,17 @@ define i32 @test21_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK: 7: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 @@ -2821,13 +2829,14 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> 
[[TMP9]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[_MSPROP:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; @@ -2851,15 +2860,16 @@ define <2 x double> @test19(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP7]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP7]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx [[TMP1]]) #[[ATTR5]] +; CHECK: 7: +; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> [[TMP8]]) #[[ATTR5]] ; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[TMP2]] ; @@ -2885,8 +2895,8 @@ define i64 @test18(<2 x double> %a) #0 { ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable ; CHECK: 3: -; CHECK-NEXT: [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 @@ -2915,8 +2925,8 @@ define i64 @test17(<2 x double> %a) #0 { ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable ; CHECK: 3: -; CHECK-NEXT: [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 @@ -2941,20 +2951,24 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to 
x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP12]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]], i8 16) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]], i8 16) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; @@ -2978,13 +2992,13 @@ define i64 @test15(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -3010,13 +3024,13 @@ define i64 @test14(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx 
@llvm.x86.ssse3.pabs.w(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -3042,13 +3056,13 @@ define i64 @test13(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <8 x i8> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -3077,16 +3091,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3117,16 +3131,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3157,16 +3171,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> 
[[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3197,16 +3211,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3237,16 +3251,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx 
@llvm.x86.ssse3.pmul.hr.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3277,18 +3291,18 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP21]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <4 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP14]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -3321,16 +3335,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to 
<4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3361,16 +3375,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; 
CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3401,16 +3415,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3441,16 +3455,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = 
bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3481,16 +3495,16 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3514,20 +3528,21 @@ define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 { ; ALL-NEXT: cvtpi2ps %mm0, %xmm0 ; ALL-NEXT: ret{{[l|q]}} ; CHECK-LABEL: define <4 x float> @test89( -; CHECK-SAME: <4 x float> [[A:%.*]], x86_mmx [[B:%.*]]) #[[ATTR4:[0-9]+]] { +; CHECK-SAME: <4 x float> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable 
-; CHECK: 5: -; CHECK-NEXT: [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], x86_mmx [[B]]) +; CHECK: 6: +; CHECK-NEXT: [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], <1 x i64> [[B]]) ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[C]] ; @@ -3561,8 +3576,8 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 { ; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[A_COERCE]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP6]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] @@ -3571,8 +3586,8 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 { ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable ; CHECK: 5: -; CHECK-NEXT: [[TMP1:%.*]] = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx [[TMP0]], i32 [[D]], i32 2) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> [[TMP8]], i32 [[D]], i32 2) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP9]] to <1 x i64> ; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP2]] ; @@ -3591,15 +3606,15 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[A_COERCE]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable ; CHECK: 4: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx [[TMP0]], i32 2) +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> [[TMP6]], i32 2) ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP1]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll index 57d6003b3873f1..5197f3277ed80a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll @@ -31,12 +31,12 @@ entry: } ; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw( -; CHECK: or i64 -; CHECK: bitcast i64 {{.*}} to <4 x i16> +; CHECK: or <1 x i64> +; CHECK: bitcast <1 x i64> {{.*}} to <4 x i16> ; CHECK: icmp ne <4 x i16> {{.*}}, zeroinitializer ; CHECK: sext <4 x i1> {{.*}} to <4 x i16> -; CHECK: bitcast <4 x i16> {{.*}} to i64 -; CHECK: ret x86_mmx +; CHECK: bitcast <4 x i16> {{.*}} to <1 x i64> +; CHECK: ret <1 x i64> define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory { 
@@ -60,8 +60,8 @@ entry: } ; CHECK-LABEL: @Test_x86_mmx_psad_bw( -; CHECK: or i64 +; CHECK: or <1 x i64> ; CHECK: icmp ne i64 ; CHECK: sext i1 {{.*}} to i64 ; CHECK: lshr i64 {{.*}}, 48 -; CHECK: ret x86_mmx +; CHECK: ret <1 x i64> diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll index 52acbfe0a0e779..6ae03f288e2c0e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll @@ -42,9 +42,9 @@ entry: ; CHECK: icmp ne {{.*}}[[S]], 0 ; CHECK: br ; CHECK: call void @__msan_warning_noreturn() -; CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi -; CHECK: store i64 0, {{.*}} @__msan_retval_tls -; CHECK: ret x86_mmx +; CHECK: call <1 x i64> @llvm.x86.sse.cvtps2pi +; CHECK: store <1 x i64> zeroinitializer, {{.*}} @__msan_retval_tls +; CHECK: ret <1 x i64> ; avx512 rounding conversion. diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll index 4f08ea7c00afee..1289abd63667ee 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll @@ -48,15 +48,14 @@ entry: } ; CHECK-LABEL: @Test_mmx_packuswb( -; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16> -; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16> +; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16> +; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16> ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16> ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16> -; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx -; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx -; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packsswb({{.*}} -; CHECK-DAG: bitcast x86_mmx {{.*}} to i64 -; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packuswb({{.*}} -; CHECK: ret x86_mmx +; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64> +; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64> +; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packsswb({{.*}} +; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packuswb({{.*}} +; CHECK: ret <1 x i64> diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll index 196285d910a6da..3c6c44194e3ac5 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll @@ -30,11 +30,11 @@ entry: ; CHECK-LABEL: @test_mmx ; CHECK: = icmp ne i64 {{.*}}, 0 -; CHECK: [[C:%.*]] = sext i1 {{.*}} to i64 -; CHECK: [[A:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d( -; CHECK: [[B:%.*]] = bitcast x86_mmx {{.*}}[[A]] to i64 -; CHECK: = or i64 {{.*}}[[B]], {{.*}}[[C]] -; CHECK: call x86_mmx @llvm.x86.mmx.psll.d( +; CHECK: [[B:%.*]] = sext i1 {{.*}} to i64 +; CHECK: [[C:%.*]] = bitcast i64 [[B]] to <1 x i64> +; CHECK: [[A:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d( +; CHECK: = or <1 x i64> {{.*}}[[A]], {{.*}}[[C]] +; CHECK: call <1 x i64> @llvm.x86.mmx.psll.d( ; CHECK: ret i64 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll index 63114288fc5810..9fbc39241d8e98 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll @@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) { ; CHECK-LABEL: 
@test_upper_x86_mmx_pmovmskb( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[A0:%.*]]) ; CHECK-NEXT: ret i32 [[TMP1]] ; %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0) @@ -207,16 +207,6 @@ define i32 @undef_x86_avx2_pmovmskb() { ; Constant Folding (ZERO -> ZERO) ; -define i32 @zero_x86_mmx_pmovmskb() { -; CHECK-LABEL: @zero_x86_mmx_pmovmskb( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx)) -; CHECK-NEXT: ret i32 [[TMP1]] -; - %1 = bitcast <1 x i64> zeroinitializer to x86_mmx - %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1) - ret i32 %2 -} - define i32 @zero_x86_sse_movmsk_ps() { ; CHECK-LABEL: @zero_x86_sse_movmsk_ps( ; CHECK-NEXT: ret i32 0 @@ -271,7 +261,7 @@ define i32 @zero_x86_avx2_pmovmskb() { define i32 @fold_x86_mmx_pmovmskb() { ; CHECK-LABEL: @fold_x86_mmx_pmovmskb( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> to x86_mmx)) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> ) ; CHECK-NEXT: ret i32 [[TMP1]] ; %1 = bitcast <8 x i8> to x86_mmx diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll index 38a7391a1a1e37..d4ec9e3aae6795 100644 --- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll @@ -38,38 +38,6 @@ define <1 x i64> @d(i64 %y) { ret <1 x i64> %c } -define x86_mmx @e(<1 x i64> %y) { -; CHECK-LABEL: @e( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0 -; CHECK-NEXT: [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: ret x86_mmx [[C]] -; - %c = bitcast <1 x i64> %y to x86_mmx - ret x86_mmx %c -} - -define <1 x i64> @f(x86_mmx %y) { -; CHECK-LABEL: @f( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64 -; CHECK-NEXT: [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0 -; CHECK-NEXT: ret <1 x i64> [[C]] -; - %c = bitcast x86_mmx %y to <1 x i64> - ret <1 x i64> %c -} - -define double @g(x86_mmx %x) { -; CHECK-LABEL: @g( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double -; CHECK-NEXT: ret double [[TMP0]] -; -entry: - %0 = bitcast x86_mmx %x to <1 x i64> - %1 = bitcast <1 x i64> %0 to double - ret double %1 -} - ; FP source is ok. 
define <3 x i64> @bitcast_inselt_undef(double %x, i32 %idx) { @@ -137,19 +105,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) { ret <3 x i64> %i } -; Negative test - source type must be scalar - -define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) { -; CHECK-LABEL: @bitcast_inselt_undef_from_mmx( -; CHECK-NEXT: [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64 -; CHECK-NEXT: [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]] -; CHECK-NEXT: ret <3 x i64> [[I]] -; - %xb = bitcast x86_mmx %x to i64 - %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx - ret <3 x i64> %i -} - ; Reduce number of casts define <2 x i64> @PR45748(double %x, double %y) { diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll index 8b8325b1472637..f787b3c4cc9ac2 100644 --- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll +++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll @@ -38,37 +38,6 @@ define <1 x i64> @d(i64 %y) { ret <1 x i64> %c } -define x86_mmx @e(<1 x i64> %y) { -; CHECK-LABEL: @e( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0 -; CHECK-NEXT: [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: ret x86_mmx [[C]] -; - %c = bitcast <1 x i64> %y to x86_mmx - ret x86_mmx %c -} - -define <1 x i64> @f(x86_mmx %y) { -; CHECK-LABEL: @f( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64 -; CHECK-NEXT: [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0 -; CHECK-NEXT: ret <1 x i64> [[C]] -; - %c = bitcast x86_mmx %y to <1 x i64> - ret <1 x i64> %c -} - -define double @g(x86_mmx %x) { -; CHECK-LABEL: @g( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double -; CHECK-NEXT: ret double [[TMP0]] -; -entry: - %0 = bitcast x86_mmx %x to <1 x i64> - %1 = bitcast <1 x i64> %0 to double - ret double %1 -} ; FP source is ok. @@ -137,19 +106,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) { ret <3 x i64> %i } -; Negative test - source type must be scalar - -define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) { -; CHECK-LABEL: @bitcast_inselt_undef_from_mmx( -; CHECK-NEXT: [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64 -; CHECK-NEXT: [[I:%.*]] = insertelement <3 x i64> undef, i64 [[XB]], i32 [[IDX:%.*]] -; CHECK-NEXT: ret <3 x i64> [[I]] -; - %xb = bitcast x86_mmx %x to i64 - %i = insertelement <3 x i64> undef, i64 %xb, i32 %idx - ret <3 x i64> %i -} - ; Reduce number of casts define <2 x i64> @PR45748(double %x, double %y) { diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll index d4c49faf91b091..dd75560e25ceda 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll @@ -335,19 +335,6 @@ define { i64, i64 } @test_load_struct() { ret { i64, i64 } %v } -@m64 = internal constant [2 x i64] zeroinitializer -@idx = external global i32 - -; This should not try to create an x86_mmx null value. 
-define x86_mmx @load_mmx() { -; CHECK-LABEL: @load_mmx( -; CHECK-NEXT: [[TEMP:%.*]] = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64)), align 8 -; CHECK-NEXT: ret x86_mmx [[TEMP]] -; - %temp = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64)) - ret x86_mmx %temp -} - @g_offset = external global i64 @g_neg_one_vec = constant <4 x i8> diff --git a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll b/llvm/test/Transforms/LoopUnroll/X86/mmx.ll deleted file mode 100644 index b460b79d0640aa..00000000000000 --- a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll +++ /dev/null @@ -1,35 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define x86_mmx @f() #0 { -; CHECK-LABEL: define x86_mmx @f -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_7:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[PHI]], 7 -; CHECK-NEXT: [[ADD_7]] = add i32 [[PHI]], 8 -; CHECK-NEXT: [[CMP_7:%.*]] = icmp eq i32 [[ADD_6]], 0 -; CHECK-NEXT: br i1 [[CMP_7]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: exit: -; CHECK-NEXT: [[RET:%.*]] = phi x86_mmx [ undef, [[FOR_BODY]] ] -; CHECK-NEXT: ret x86_mmx [[RET]] -; -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %phi = phi i32 [ 1, %entry ], [ %add, %for.body ] - %add = add i32 %phi, 1 - %cmp = icmp eq i32 %phi, 0 - br i1 %cmp, label %exit, label %for.body - -exit: ; preds = %for.body - %ret = phi x86_mmx [ undef, %for.body ] - ret x86_mmx %ret -} - -attributes #0 = { "target-cpu"="x86-64" } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll index 7476ddb0fb873d..19ca68fc9cb2cd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll @@ -4,68 +4,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define void @test1(x86_mmx %a, x86_mmx %b, ptr %ptr) { -; Ensure we can handle x86_mmx values which are primitive and can be bitcast -; with integer types but can't be put into a vector. -; -; CHECK-LABEL: @test1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64 -; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64 -; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 -; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i32 1 -; CHECK-NEXT: store i64 [[A_AND]], ptr [[PTR]], align 8 -; CHECK-NEXT: store i64 [[B_AND]], ptr [[GEP]], align 8 -; CHECK-NEXT: ret void -; -entry: - %a.cast = bitcast x86_mmx %a to i64 - %b.cast = bitcast x86_mmx %b to i64 - %a.and = and i64 %a.cast, 42 - %b.and = and i64 %b.cast, 42 - %gep = getelementptr i64, ptr %ptr, i32 1 - store i64 %a.and, ptr %ptr - store i64 %b.and, ptr %gep - ret void -} - -define void @test2(x86_mmx %a, x86_mmx %b) { -; Same as @test1 but using phi-input vectorization instead of store -; vectorization. 
-; -; CHECK-LABEL: @test2( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[EXIT:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64 -; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64 -; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 -; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: [[A_PHI:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[A_AND]], [[IF_THEN]] ] -; CHECK-NEXT: [[B_PHI:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[B_AND]], [[IF_THEN]] ] -; CHECK-NEXT: tail call void @f(i64 [[A_PHI]], i64 [[B_PHI]]) -; CHECK-NEXT: ret void -; -entry: - br i1 undef, label %if.then, label %exit - -if.then: - %a.cast = bitcast x86_mmx %a to i64 - %b.cast = bitcast x86_mmx %b to i64 - %a.and = and i64 %a.cast, 42 - %b.and = and i64 %b.cast, 42 - br label %exit - -exit: - %a.phi = phi i64 [ 0, %entry ], [ %a.and, %if.then ] - %b.phi = phi i64 [ 0, %entry ], [ %b.and, %if.then ] - tail call void @f(i64 %a.phi, i64 %b.phi) - ret void -} - define i8 @test3(ptr %addr) { ; Check that we do not vectorize types that are padded to a bigger ones. ; diff --git a/llvm/test/Transforms/SROA/pr57796.ll b/llvm/test/Transforms/SROA/pr57796.ll index 1bf1ad7ee934a5..dbcb6d07849717 100644 --- a/llvm/test/Transforms/SROA/pr57796.ll +++ b/llvm/test/Transforms/SROA/pr57796.ll @@ -17,9 +17,9 @@ define void @foo() { ; CHECK-NEXT: [[CALL_I:%.*]] = call align 32 ptr @value_set_type(ptr align 32 [[REF_TMP_I]]) ; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[CALL_I]], align 32 ; CHECK-NEXT: [[REF_TMP_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 0) -; CHECK-NEXT: store x86_mmx [[TMP2]], ptr @A, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP1]], i8 0) +; CHECK-NEXT: store <1 x i64> [[TMP2]], ptr @A, align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp index 6fa36421810f0f..1e77930933c7a2 100644 --- a/llvm/tools/llvm-c-test/echo.cpp +++ b/llvm/tools/llvm-c-test/echo.cpp @@ -153,8 +153,6 @@ struct TypeCloner { return LLVMMetadataTypeInContext(Ctx); case LLVMX86_AMXTypeKind: return LLVMX86AMXTypeInContext(Ctx); - case LLVMX86_MMXTypeKind: - return LLVMX86MMXTypeInContext(Ctx); case LLVMTokenTypeKind: return LLVMTokenTypeInContext(Ctx); case LLVMTargetExtTypeKind: { diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp index 758643f1414c1e..80fb21038d304f 100644 --- a/llvm/tools/llvm-stress/llvm-stress.cpp +++ b/llvm/tools/llvm-stress/llvm-stress.cpp @@ -173,8 +173,6 @@ struct Modifier { Ty = Type::getX86_FP80Ty(Context); else if (Arg == "ppc_fp128") Ty = Type::getPPC_FP128Ty(Context); - else if (Arg == "x86_mmx") - Ty = Type::getX86_MMXTy(Context); else if (Arg.starts_with("i")) { unsigned N = 0; Arg.drop_front().getAsInteger(10, N); @@ -294,11 +292,7 @@ struct Modifier { /// Pick a random vector type. Type *pickVectorType(VectorType *VTy = nullptr) { - // Vectors of x86mmx are illegal; keep trying till we get something else. 
- Type *Ty; - do { - Ty = pickScalarType(); - } while (Ty->isX86_MMXTy()); + Type *Ty = pickScalarType(); if (VTy) return VectorType::get(Ty, VTy->getElementCount()); diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp index 4c1e9a9acb29ac..44b25035dde2c5 100644 --- a/llvm/unittests/IR/InstructionsTest.cpp +++ b/llvm/unittests/IR/InstructionsTest.cpp @@ -205,7 +205,6 @@ TEST(InstructionsTest, CastInst) { Type *Int64Ty = Type::getInt64Ty(C); Type *V8x8Ty = FixedVectorType::get(Int8Ty, 8); Type *V8x64Ty = FixedVectorType::get(Int64Ty, 8); - Type *X86MMXTy = Type::getX86_MMXTy(C); Type *HalfTy = Type::getHalfTy(C); Type *FloatTy = Type::getFloatTy(C); @@ -248,9 +247,6 @@ TEST(InstructionsTest, CastInst) { EXPECT_EQ(CastInst::Trunc, CastInst::getCastOpcode(c64, true, V8x8Ty, true)); EXPECT_EQ(CastInst::SExt, CastInst::getCastOpcode(c8, true, V8x64Ty, true)); - EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, X86MMXTy)); - EXPECT_FALSE(CastInst::isBitCastable(X86MMXTy, V8x8Ty)); - EXPECT_FALSE(CastInst::isBitCastable(Int64Ty, X86MMXTy)); EXPECT_FALSE(CastInst::isBitCastable(V8x64Ty, V8x8Ty)); EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, V8x64Ty)); @@ -1745,7 +1741,7 @@ TEST(InstructionsTest, AllocaInst) { %A = alloca i32, i32 1 %B = alloca i32, i32 4 %C = alloca i32, i32 %n - %D = alloca <8 x double> + %D = alloca double %E = alloca %F = alloca [2 x half] %G = alloca [2 x [3 x i128]] @@ -1771,7 +1767,8 @@ TEST(InstructionsTest, AllocaInst) { EXPECT_EQ(A.getAllocationSizeInBits(DL), TypeSize::getFixed(32)); EXPECT_EQ(B.getAllocationSizeInBits(DL), TypeSize::getFixed(128)); EXPECT_FALSE(C.getAllocationSizeInBits(DL)); - EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(512)); + EXPECT_EQ(DL.getTypeSizeInBits(D.getAllocatedType()), TypeSize::getFixed(64)); + EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(64)); EXPECT_EQ(E.getAllocationSizeInBits(DL), TypeSize::getScalable(512)); EXPECT_EQ(F.getAllocationSizeInBits(DL), TypeSize::getFixed(32)); EXPECT_EQ(G.getAllocationSizeInBits(DL), TypeSize::getFixed(768)); diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md index bc0f484108facf..fadc81b567b4e4 100644 --- a/mlir/docs/Dialects/LLVM.md +++ b/mlir/docs/Dialects/LLVM.md @@ -240,8 +240,6 @@ dialect as there is no corresponding built-in type. The following non-parametric types derived from the LLVM IR are available in the LLVM dialect: -- `!llvm.x86_mmx` (`LLVMX86MMXType`) - value held in an MMX register on x86 - machine. - `!llvm.ppc_fp128` (`LLVMPPCFP128Type`) - 128-bit floating-point value (two 64 bits). 
- `!llvm.token` (`LLVMTokenType`) - a non-inspectable value associated with an diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h index 93733ccd4929ae..1befdfa74f67c5 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h @@ -67,7 +67,6 @@ namespace LLVM { DEFINE_TRIVIAL_LLVM_TYPE(LLVMVoidType, "llvm.void"); DEFINE_TRIVIAL_LLVM_TYPE(LLVMPPCFP128Type, "llvm.ppc_fp128"); -DEFINE_TRIVIAL_LLVM_TYPE(LLVMX86MMXType, "llvm.x86_mmx"); DEFINE_TRIVIAL_LLVM_TYPE(LLVMTokenType, "llvm.token"); DEFINE_TRIVIAL_LLVM_TYPE(LLVMLabelType, "llvm.label"); DEFINE_TRIVIAL_LLVM_TYPE(LLVMMetadataType, "llvm.metadata"); diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 9372caf6e32a73..c08360fc6101da 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -3132,7 +3132,6 @@ void LLVMDialect::initialize() { // clang-format off addTypes(type) .Case([&](Type) { return "void"; }) .Case([&](Type) { return "ppc_fp128"; }) - .Case([&](Type) { return "x86_mmx"; }) .Case([&](Type) { return "token"; }) .Case([&](Type) { return "label"; }) .Case([&](Type) { return "metadata"; }) @@ -309,7 +308,6 @@ static Type dispatchParse(AsmParser &parser, bool allowAny = true) { return StringSwitch>(key) .Case("void", [&] { return LLVMVoidType::get(ctx); }) .Case("ppc_fp128", [&] { return LLVMPPCFP128Type::get(ctx); }) - .Case("x86_mmx", [&] { return LLVMX86MMXType::get(ctx); }) .Case("token", [&] { return LLVMTokenType::get(ctx); }) .Case("label", [&] { return LLVMLabelType::get(ctx); }) .Case("metadata", [&] { return LLVMMetadataType::get(ctx); }) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp index cf3f38b7101307..e536c4a792732c 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp @@ -781,7 +781,6 @@ bool mlir::LLVM::isCompatibleOuterType(Type type) { LLVMScalableVectorType, LLVMTargetExtType, LLVMVoidType, - LLVMX86MMXType >(type)) { // clang-format on return true; @@ -844,7 +843,6 @@ static bool isCompatibleImpl(Type type, DenseSet &compatibleTypes) { LLVMPPCFP128Type, LLVMTokenType, LLVMVoidType, - LLVMX86MMXType >([](Type) { return true; }) // clang-format on .Default([](Type) { return false; }); @@ -986,8 +984,7 @@ llvm::TypeSize mlir::LLVM::getPrimitiveTypeSizeInBits(Type type) { .Case( [](Type) { return llvm::TypeSize::getFixed(16); }) .Case([](Type) { return llvm::TypeSize::getFixed(32); }) - .Case( - [](Type) { return llvm::TypeSize::getFixed(64); }) + .Case([](Type) { return llvm::TypeSize::getFixed(64); }) .Case([](Type) { return llvm::TypeSize::getFixed(80); }) .Case([](Type) { return llvm::TypeSize::getFixed(128); }) .Case([](IntegerType intTy) { diff --git a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp index a4db958207756c..db184ae8e6e833 100644 --- a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp +++ b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp @@ -65,8 +65,6 @@ class TypeFromLLVMIRTranslatorImpl { return Float80Type::get(&context); if (type->isPPC_FP128Ty()) return LLVM::LLVMPPCFP128Type::get(&context); - if (type->isX86_MMXTy()) - return LLVM::LLVMX86MMXType::get(&context); if (type->isLabelTy()) return LLVM::LLVMLabelType::get(&context); if (type->isMetadataTy()) diff --git a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp index 
6d8b415ff09dce..65915027238801 100644 --- a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp +++ b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp @@ -58,9 +58,6 @@ class TypeToLLVMIRTranslatorImpl { .Case([this](LLVM::LLVMPPCFP128Type) { return llvm::Type::getPPC_FP128Ty(context); }) - .Case([this](LLVM::LLVMX86MMXType) { - return llvm::Type::getX86_MMXTy(context); - }) .Case([this](LLVM::LLVMTokenType) { return llvm::Type::getTokenTy(context); }) diff --git a/mlir/test/Dialect/LLVMIR/types.mlir b/mlir/test/Dialect/LLVMIR/types.mlir index 2dd292408fa60d..42d370a5477c23 100644 --- a/mlir/test/Dialect/LLVMIR/types.mlir +++ b/mlir/test/Dialect/LLVMIR/types.mlir @@ -6,8 +6,6 @@ func.func @primitive() { "some.op"() : () -> !llvm.void // CHECK: !llvm.ppc_fp128 "some.op"() : () -> !llvm.ppc_fp128 - // CHECK: !llvm.x86_mmx - "some.op"() : () -> !llvm.x86_mmx // CHECK: !llvm.token "some.op"() : () -> !llvm.token // CHECK: !llvm.label diff --git a/mlir/test/Target/LLVMIR/llvmir-types.mlir b/mlir/test/Target/LLVMIR/llvmir-types.mlir index c85fa0101c00d7..3e533211b0d0c4 100644 --- a/mlir/test/Target/LLVMIR/llvmir-types.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-types.mlir @@ -20,8 +20,6 @@ llvm.func @return_fp128() -> f128 llvm.func @return_x86_fp80() -> f80 // CHECK: declare ppc_fp128 @return_ppc_fp128() llvm.func @return_ppc_fp128() -> !llvm.ppc_fp128 -// CHECK: declare x86_mmx @return_x86_mmx() -llvm.func @return_x86_mmx() -> !llvm.x86_mmx // // Functions. From 4b625fdb92e4a13a3cbb0b20d40d3c81bddb7cc4 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Wed, 10 Jul 2024 19:04:31 -0400 Subject: [PATCH 3/8] Move the MMX intrinsic cast hack from generic SelectionDAGBuilder to X86 DAGCombine. --- .../SelectionDAG/SelectionDAGBuilder.cpp | 16 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 92 ++++++++++++- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 121 +++++++++++++++++- llvm/test/CodeGen/X86/mmx-arith.ll | 73 ++++++----- llvm/test/CodeGen/X86/mmx-cvt.ll | 98 ++------------ llvm/test/CodeGen/X86/pr29222.ll | 6 +- 6 files changed, 271 insertions(+), 135 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 54f7f127ae663e..b19047e03b149d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5273,21 +5273,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, DAG.getTargetConstantFP(*cast(Arg), SDLoc(), VT)); } } - if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) { - for (SDValue &Op : Ops) { - if (Op.getValueType() == MVT::v1i64) - Op = DAG.getBitcast(MVT::x86mmx, Op); - } - } SmallVector ValueVTs; ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); - if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) { - if (ValueVTs.size() == 1 && ValueVTs[0] == MVT::v1i64) - ValueVTs[0] = MVT::x86mmx; - } - if (HasChain) ValueVTs.push_back(MVT::Other); @@ -5356,11 +5345,6 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } } - if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) { - if (Result.getValueType() == MVT::x86mmx) - Result = DAG.getBitcast(MVT::v1i64, Result); - } - setValue(&I, Result); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fa26849e0bc5a1..18f8c1a22ff5a9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2565,7 +2565,10 @@ 
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND, ISD::FP_ROUND, - ISD::STRICT_FP_ROUND}); + ISD::STRICT_FP_ROUND, + ISD::INTRINSIC_VOID, + ISD::INTRINSIC_WO_CHAIN, + ISD::INTRINSIC_W_CHAIN}); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -27299,6 +27302,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, llvm_unreachable("Unsupported truncstore intrinsic"); } } + case INTR_TYPE_CAST_MMX: + return SDValue(); // handled in combineINTRINSIC_* } } @@ -57594,6 +57599,86 @@ static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>, +// and so SelectionDAGBuilder creates them with v1i64 types, but they need to +// use x86mmx instead. +static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) { + SDLoc dl(N); + + bool MadeChange = false, CastReturnVal = false; + SmallVector Args; + for (const SDValue &Arg : N->op_values()) { + if (Arg.getValueType() == MVT::v1i64) { + MadeChange = true; + Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg)); + } else + Args.push_back(Arg); + } + SDVTList VTs = N->getVTList(); + SDVTList NewVTs = VTs; + if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) { + SmallVector NewVTArr(ArrayRef(VTs.VTs, VTs.NumVTs)); + NewVTArr[0] = MVT::x86mmx; + NewVTs = DAG.getVTList(NewVTArr); + MadeChange = true; + CastReturnVal = true; + } + + if (MadeChange) { + SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args); + if (CastReturnVal) { + SmallVector Returns; + for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i) + Returns.push_back(Result.getValue(i)); + Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]); + return DAG.getMergeValues(Returns, dl); + } + return Result; + } + return SDValue(); +} +static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(0); + const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + +static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(1); + const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + +static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(1); + const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -57784,7 +57869,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI); case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); case X86ISD::PDEP: return combinePDEP(N, DAG, DCI); - // clang-format on + case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI); + case 
ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI); + case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI); + // clang-format on } return SDValue(); diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 717541cf6c559b..a336ea75afdae3 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -73,7 +73,8 @@ enum IntrinsicType : uint16_t { GATHER_AVX2, ROUNDP, ROUNDS, - RDPRU + RDPRU, + INTR_TYPE_CAST_MMX }; struct IntrinsicData { @@ -323,6 +324,8 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(mmx_maskmovq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_movnt_dq, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0), X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), @@ -352,6 +355,31 @@ static const IntrinsicData *getIntrinsicWithChain(unsigned IntNo) { * the alphabetical order. */ static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(3dnow_pavgusb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pf2id, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfacc, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfadd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfcmpeq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfcmpge, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfcmpgt, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfmax, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfmin, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfmul, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfrcp, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfrcpit1, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfrcpit2, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfrsqit1, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfrsqrt, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfsub, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pfsubr, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pi2fd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnow_pmulhrw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnowa_pf2iw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnowa_pfnacc, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnowa_pfpnacc, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnowa_pi2fw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(3dnowa_pswapd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(addcarry_32, ADX, X86ISD::ADC, X86ISD::ADD), X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD), X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), @@ -1495,6 +1523,75 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + + X86_INTRINSIC_DATA(mmx_packssdw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_packsswb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_packuswb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_q, INTR_TYPE_CAST_MMX, 0, 0), 
+ X86_INTRINSIC_DATA(mmx_padd_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padds_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padds_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_paddus_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_paddus_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_palignr_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pand, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pandn, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pavg_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pavg_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pextr_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pinsr_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmadd_wd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmaxs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmaxu_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmins_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pminu_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmovmskb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulh_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulhu_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmull_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulu_dq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_por, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psad_bw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psra_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psra_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrai_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrai_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubs_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubus_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubus_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhbw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhdq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhwd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpcklbw, INTR_TYPE_CAST_MMX, 0, 0), + 
X86_INTRINSIC_DATA(mmx_punpckldq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpcklwd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pxor, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), @@ -1503,8 +1600,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE), X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse_cvtpd2pi, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtpi2pd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtpi2ps, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtps2pi, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_cvtss2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), X86_INTRINSIC_DATA(sse_cvtss2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), + X86_INTRINSIC_DATA(sse_cvttpd2pi, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvttps2pi, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0), X86_INTRINSIC_DATA(sse_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0), X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), @@ -1512,6 +1615,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0), X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), + X86_INTRINSIC_DATA(sse_pshuf_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0), X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), @@ -1593,14 +1697,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0), X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0), + X86_INTRINSIC_DATA(ssse3_pabs_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_pabs_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_pabs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(ssse3_pmul_hr_sw, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(ssse3_pshuf_b, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(ssse3_psign_b, INTR_TYPE_CAST_MMX, 0, 0), + 
X86_INTRINSIC_DATA(ssse3_psign_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_psign_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0), diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 68287a4feee47f..230e763a7c7340 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -18,8 +18,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X86-NEXT: paddsb (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: paddusb (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: psubb %xmm1, %xmm0 ; X86-NEXT: movdq2q %xmm0, %mm0 @@ -27,8 +27,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X86-NEXT: psubsb (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: psubusb (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -58,8 +58,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X64-NEXT: paddsb (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: paddusb (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: movdq2q %xmm0, %mm0 @@ -67,8 +67,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X64-NEXT: psubsb (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: psubusb (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -239,8 +239,13 @@ entry: define void @test2(ptr %A, ptr %B) nounwind { ; X86-LABEL: test2: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: paddw %xmm0, %xmm1 @@ -249,8 +254,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: paddsw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: paddusw (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: psubw %xmm1, %xmm0 ; X86-NEXT: movdq2q %xmm0, %mm0 @@ -258,8 +263,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: psubsw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: psubusw (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: pmullw %xmm0, %xmm1 ; X86-NEXT: movdq2q %xmm1, %mm0 @@ -267,18 +272,26 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: pmulhw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: pmaddwd (%ecx), 
%mm0 +; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: movl (%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movq %mm0, (%eax) -; X86-NEXT: movq2dq %mm0, %xmm0 -; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: andps %xmm0, %xmm1 -; X86-NEXT: movlps %xmm1, (%eax) -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: orps %xmm1, %xmm0 -; X86-NEXT: movlps %xmm0, (%eax) -; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: xorps %xmm0, %xmm1 -; X86-NEXT: movlps %xmm1, (%eax) +; X86-NEXT: andl 4(%ecx), %esi +; X86-NEXT: movd %esi, %xmm0 +; X86-NEXT: andl (%ecx), %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%eax) +; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: pxor %xmm0, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: emms +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: test2: @@ -291,8 +304,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: paddsw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: paddusw (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: psubw %xmm1, %xmm0 ; X64-NEXT: movdq2q %xmm0, %mm0 @@ -300,8 +313,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: psubsw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: psubusw (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: pmullw %xmm0, %xmm1 ; X64-NEXT: movdq2q %xmm1, %mm0 @@ -309,17 +322,17 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: pmulhw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: pmaddwd (%rsi), %mm0 +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: movq %mm0, (%rdi) -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: andps %xmm0, %xmm1 -; X64-NEXT: movlps %xmm1, (%rdi) -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: orps %xmm1, %xmm0 -; X64-NEXT: movlps %xmm0, (%rdi) -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: xorps %xmm0, %xmm1 -; X64-NEXT: movlps %xmm1, (%rdi) +; X64-NEXT: andq (%rsi), %rax +; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll index c09c417c11c966..11473f3f6c2363 100644 --- a/llvm/test/CodeGen/X86/mmx-cvt.ll +++ b/llvm/test/CodeGen/X86/mmx-cvt.ll @@ -8,20 +8,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: cvt_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvtpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; 
X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvt_v2f64_v2i32: @@ -44,20 +34,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: cvtt_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvtt_v2f64_v2i32: @@ -80,20 +60,10 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: fptosi_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v2f64_v2i32: @@ -114,20 +84,10 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: cvt_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvtps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvt_v2f32_v2i32: @@ -150,20 +110,10 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind { define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: cvtt_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvtt_v2f32_v2i32: @@ -186,20 +136,10 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind { define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind { ; X86-LABEL: fptosi_v4f32_v4i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl 
%edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v4f32_v4i32: @@ -221,20 +161,10 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind { define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: fptosi_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v2f32_v2i32: diff --git a/llvm/test/CodeGen/X86/pr29222.ll b/llvm/test/CodeGen/X86/pr29222.ll index 9a38515b65594c..1ddcb1fb56524c 100644 --- a/llvm/test/CodeGen/X86/pr29222.ll +++ b/llvm/test/CodeGen/X86/pr29222.ll @@ -32,7 +32,7 @@ define i32 @PR29222(i32) nounwind { ; X86-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] ; X86-AVX-NEXT: packsswb %mm0, %mm0 ; X86-AVX-NEXT: movq %mm0, (%esp) -; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vpbroadcastq (%esp), %xmm0 ; X86-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd %xmm0, %eax ; X86-AVX-NEXT: movl %ebp, %esp @@ -54,7 +54,9 @@ define i32 @PR29222(i32) nounwind { ; X64-AVX-NEXT: movd %edi, %mm0 ; X64-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] ; X64-AVX-NEXT: packsswb %mm0, %mm0 -; X64-AVX-NEXT: movq2dq %mm0, %xmm0 +; X64-AVX-NEXT: movq %mm0, %rax +; X64-AVX-NEXT: vmovq %rax, %xmm0 +; X64-AVX-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; X64-AVX-NEXT: vmovd %xmm0, %eax ; X64-AVX-NEXT: retq From 98a8b237810812c2f429aa5555fa8de65a1e0962 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Thu, 11 Jul 2024 13:10:51 -0400 Subject: [PATCH 4/8] Add release notes and adjust documentation for change. Fix a couple nits. --- clang/lib/CodeGen/Targets/X86.cpp | 4 ++-- llvm/docs/BitCodeFormat.rst | 2 +- llvm/docs/LangRef.rst | 20 +------------------ llvm/docs/ReleaseNotes.rst | 13 ++++++++++++ .../SelectionDAG/SelectionDAGBuilder.cpp | 1 - 5 files changed, 17 insertions(+), 23 deletions(-) diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 8913b188f6aec3..16d52bee3490b7 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -24,9 +24,9 @@ bool IsX86_MMXType(llvm::Type *IRType) { IRType->getScalarSizeInBits() != 64; } -static llvm::Type* X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF, +static llvm::Type *X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF, StringRef Constraint, - llvm::Type* Ty) { + llvm::Type *Ty) { if (Constraint == "k") { llvm::Type *Int1Ty = llvm::Type::getInt1Ty(CGF.getLLVMContext()); return llvm::FixedVectorType::get(Int1Ty, Ty->getScalarSizeInBits()); diff --git a/llvm/docs/BitCodeFormat.rst b/llvm/docs/BitCodeFormat.rst index 46af2e421a258c..1a724a58f58e02 100644 --- a/llvm/docs/BitCodeFormat.rst +++ b/llvm/docs/BitCodeFormat.rst @@ -1291,7 +1291,7 @@ TYPE_CODE_X86_MMX Record ``[X86_MMX]`` -The ``X86_MMX`` record (code 17) adds an ``x86_mmx`` type to the type table. +The ``X86_MMX`` record (code 17) is deprecated, and imported as a <1 x i64> vector. 
TYPE_CODE_STRUCT_ANON Record ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index ae39217dc8ff8e..2ff5a35ad650b6 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -3945,24 +3945,6 @@ or constants of this type. x86_amx -X86_mmx Type -"""""""""""" - -:Overview: - -The x86_mmx type represents a value held in an MMX register on an x86 -machine. The operations allowed on it are quite limited: parameters and -return values, load and store, and bitcast. User-specified MMX -instructions are represented as intrinsic or asm calls with arguments -and/or results of this type. There are no arrays, vectors or constants -of this type. - -:Syntax: - -:: - - x86_mmx - .. _t_pointer: @@ -4396,7 +4378,7 @@ represented by ``0xH`` followed by 4 hexadecimal digits. The bfloat 16-bit format is represented by ``0xR`` followed by 4 hexadecimal digits. All hexadecimal formats are big-endian (sign bit at the left). -There are no constants of type x86_mmx and x86_amx. +There are no constants of type x86_amx. .. _complexconstants: diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 4f442bde239a9f..0060917259b8f1 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -79,6 +79,8 @@ Changes to the LLVM IR * ``llvm.instprof.mcdc.tvbitmap.update``: 3rd argument has been removed. The next argument has been changed from byte index to bit index. +* The ``x86_mmx`` IR type has been removed. It will be translated to + the standard vector type ``<1 x i64>`` in bitcode upgrade. Changes to LLVM infrastructure ------------------------------ @@ -209,6 +211,11 @@ Changes to the X86 Backend - Removed knl/knm specific ISA intrinsics: AVX512PF, AVX512ER, PREFETCHWT1, while assembly encoding/decoding supports are kept. +- Due to the removal of the ``x86_mmx`` IR type, functions with + ``x86_mmx`` arguments or return values will use a different, + incompatible, calling convention ABI. Such functions are not + generally seen in the wild (Clang never generates them!), so this is + not expected to result in real-world compatibility problems. Changes to the OCaml bindings ----------------------------- @@ -301,6 +308,12 @@ They are described in detail in the `debug info migration guide doesNotAccessMemory(); bool OnlyLoad = HasChain && F->onlyReadsMemory(); From 155e6e5e3809c866597ac84de4904b2f80f69fe8 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Sun, 14 Jul 2024 12:08:02 -0400 Subject: [PATCH 5/8] format --- llvm/lib/IR/Type.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 18a547e75fe1e4..9ddccce7f959c7 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -44,8 +44,7 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { case FP128TyID : return getFP128Ty(C); case PPC_FP128TyID : return getPPC_FP128Ty(C); case LabelTyID : return getLabelTy(C); - case MetadataTyID: - return getMetadataTy(C); + case MetadataTyID : return getMetadataTy(C); case X86_AMXTyID : return getX86_AMXTy(C); case TokenTyID : return getTokenTy(C); default: From cf33ae4a0ae5366ede417b72c2f73590b5f9c315 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Mon, 22 Jul 2024 15:28:58 -0400 Subject: [PATCH 6/8] Fix logical conflicts after merge. 
--- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index a336ea75afdae3..685daca360e082 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -355,31 +355,6 @@ static const IntrinsicData *getIntrinsicWithChain(unsigned IntNo) { * the alphabetical order. */ static const IntrinsicData IntrinsicsWithoutChain[] = { - X86_INTRINSIC_DATA(3dnow_pavgusb, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pf2id, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfacc, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfadd, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfcmpeq, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfcmpge, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfcmpgt, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfmax, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfmin, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfmul, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfrcp, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfrcpit1, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfrcpit2, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfrsqit1, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfrsqrt, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfsub, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pfsubr, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pi2fd, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnow_pmulhrw, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnowa_pf2iw, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnowa_pfnacc, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnowa_pfpnacc, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnowa_pi2fw, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(3dnowa_pswapd, INTR_TYPE_CAST_MMX, 0, 0), - X86_INTRINSIC_DATA(addcarry_32, ADX, X86ISD::ADC, X86ISD::ADD), X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD), X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), From 5032ddf3f57eeb91ee8c66d5e741a0e042a04ca2 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Mon, 22 Jul 2024 15:34:42 -0400 Subject: [PATCH 7/8] Fix stray comma in mlir. --- mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp index e536c4a792732c..dc7aef8ef7f850 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp @@ -780,7 +780,7 @@ bool mlir::LLVM::isCompatibleOuterType(Type type) { LLVMFixedVectorType, LLVMScalableVectorType, LLVMTargetExtType, - LLVMVoidType, + LLVMVoidType >(type)) { // clang-format on return true; @@ -842,7 +842,7 @@ static bool isCompatibleImpl(Type type, DenseSet &compatibleTypes) { LLVMMetadataType, LLVMPPCFP128Type, LLVMTokenType, - LLVMVoidType, + LLVMVoidType >([](Type) { return true; }) // clang-format on .Default([](Type) { return false; }); From 1feda655630522adf86a2d71c67478ee9fc0b58f Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Wed, 24 Jul 2024 22:18:28 -0400 Subject: [PATCH 8/8] Fix test after merge. 
--- llvm/test/CodeGen/X86/fast-isel-nontemporal.ll | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll index fd9f4fa63a090e..c13fdae540d0b8 100644 --- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -94,7 +94,6 @@ entry: ; ; MMX Store -; Note: doesn't actually emit a non-temporal store here. ; define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) { @@ -102,7 +101,7 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) { ; ALL: # %bb.0: # %entry ; ALL-NEXT: movq (%rdi), %mm0 ; ALL-NEXT: psrlq $3, %mm0 -; ALL-NEXT: movq %mm0, (%rsi) +; ALL-NEXT: movntq %mm0, (%rsi) ; ALL-NEXT: retq entry: %0 = load x86_mmx, ptr %a0