From 8d00fc8a05c693d39945cfa6392fc744acbaeb7b Mon Sep 17 00:00:00 2001
From: Matthew Curtis
Date: Sat, 9 Nov 2024 08:18:46 -0600
Subject: [PATCH] Re-land x86_mmx IR type removal commits

cherry-pick:
dfeb3991fb48 jyknight@google.com Thu Jul 25 09:19:22 2024 -0400
  Remove the `x86_mmx` IR type. (#98505)
b7e4fba6e5dc jyknight@google.com Sun Jul 28 18:12:47 2024 -0400
  Cleanup x86_mmx after removing IR type (#100646)

Change-Id: I987eda387fc403ab249f9d48eeb13fd66606343a
---
 clang/test/OpenMP/allow-kernelc-io.c | 4 +-
 llvm/bindings/ocaml/llvm/llvm.mli | 4 -
 llvm/bindings/ocaml/llvm/llvm_ocaml.c | 5 -
 llvm/docs/BitCodeFormat.rst | 2 +-
 llvm/docs/LangRef.rst | 20 +-
 llvm/docs/ReleaseNotes.rst | 187 ++
 llvm/include/llvm-c/Core.h | 48 +-
 llvm/include/llvm/IR/DataLayout.h | 1 -
 llvm/include/llvm/IR/Type.h | 12 +-
 llvm/lib/Analysis/ConstantFolding.cpp | 8 +-
 llvm/lib/AsmParser/LLLexer.cpp | 1 -
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 4 +-
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 5 +-
 llvm/lib/CodeGen/ValueTypes.cpp | 7 +-
 llvm/lib/IR/AsmWriter.cpp | 5 +-
 llvm/lib/IR/ConstantFold.cpp | 2 +-
 llvm/lib/IR/Core.cpp | 8 -
 llvm/lib/IR/DataLayout.cpp | 1 -
 llvm/lib/IR/Instructions.cpp | 9 -
 llvm/lib/IR/Intrinsics.cpp | 11 +-
 llvm/lib/IR/LLVMContextImpl.cpp | 6 +-
 llvm/lib/IR/LLVMContextImpl.h | 2 +-
 llvm/lib/IR/Type.cpp | 15 +-
 .../DirectX/DXILWriter/DXILBitcodeWriter.cpp | 3 -
 .../Hexagon/HexagonTargetObjectFile.cpp | 1 -
 llvm/lib/Target/X86/X86CallingConv.td | 34 -
 llvm/lib/Target/X86/X86ISelLowering.cpp | 92 +-
 .../Target/X86/X86InstCombineIntrinsic.cpp | 8 +-
 llvm/lib/Target/X86/X86IntrinsicsInfo.h | 96 +-
 .../IPO/DeadArgumentElimination.cpp | 6 +-
 .../InstCombine/InstCombineCasts.cpp | 7 -
 .../Instrumentation/MemorySanitizer.cpp | 50 +-
 llvm/test/Assembler/x86mmx.ll | 9 -
 llvm/test/Bindings/llvm-c/echo.ll | 2 +-
 llvm/test/Bitcode/bcanalyzer-types.ll | 6 -
 llvm/test/Bitcode/compatibility-3.6.ll | 2 +-
 llvm/test/Bitcode/compatibility-3.7.ll | 2 +-
 llvm/test/Bitcode/compatibility-3.8.ll | 2 +-
 llvm/test/Bitcode/compatibility-3.9.ll | 2 +-
 llvm/test/Bitcode/compatibility-4.0.ll | 2 +-
 llvm/test/Bitcode/compatibility-5.0.ll | 2 +-
 llvm/test/Bitcode/compatibility-6.0.ll | 2 +-
 llvm/test/Bitcode/compatibility.ll | 2 -
 llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll | 6 +-
 .../test/CodeGen/X86/2007-07-03-GR64ToVR64.ll | 10 +-
 .../CodeGen/X86/2008-04-08-CoalescerCrash.ll | 8 +-
 .../CodeGen/X86/2008-08-23-64Bit-maskmovq.ll | 8 +-
 .../CodeGen/X86/2008-09-05-sinttofp-2xi32.ll | 35 +-
 .../CodeGen/X86/2011-06-14-mmx-inlineasm.ll | 8 +-
 llvm/test/CodeGen/X86/avx-vbroadcast.ll | 12 +-
 llvm/test/CodeGen/X86/avx2-vbroadcast.ll | 15 +-
 llvm/test/CodeGen/X86/bitcast-mmx.ll | 34 +-
 .../CodeGen/X86/expand-vr64-gr64-copy.mir | 6 +-
 llvm/test/CodeGen/X86/fake-use-vector.ll | 1 -
 llvm/test/CodeGen/X86/fast-isel-bc.ll | 15 +-
 .../test/CodeGen/X86/fast-isel-nontemporal.ll | 8 +-
 .../CodeGen/X86/mmx-arg-passing-x86-64.ll | 29 +-
 llvm/test/CodeGen/X86/mmx-arg-passing.ll | 21 +-
 llvm/test/CodeGen/X86/mmx-arith.ll | 380 +--
 llvm/test/CodeGen/X86/mmx-bitcast-fold.ll | 8 +-
 llvm/test/CodeGen/X86/mmx-bitcast.ll | 54 +-
 llvm/test/CodeGen/X86/mmx-build-vector.ll | 134 +-
 llvm/test/CodeGen/X86/mmx-coalescing.ll | 28 +-
 llvm/test/CodeGen/X86/mmx-cvt.ll | 160 +-
 llvm/test/CodeGen/X86/mmx-fold-load.ll | 275 +-
 llvm/test/CodeGen/X86/mmx-fold-zero.ll | 52 +-
 llvm/test/CodeGen/X86/mmx-intrinsics.ll | 1263 ++++----
 llvm/test/CodeGen/X86/mmx-only.ll | 10 +-
 llvm/test/CodeGen/X86/mxcsr-reg-usage.ll | 24 +-
 llvm/test/CodeGen/X86/nontemporal.ll | 8
+- llvm/test/CodeGen/X86/pr13859.ll | 5 +- llvm/test/CodeGen/X86/pr23246.ll | 7 +- llvm/test/CodeGen/X86/pr29222.ll | 10 +- llvm/test/CodeGen/X86/pr35982.ll | 8 +- llvm/test/CodeGen/X86/select-mmx.ll | 53 +- llvm/test/CodeGen/X86/stack-folding-mmx.ll | 1288 +++++---- llvm/test/CodeGen/X86/vec_extract-mmx.ll | 43 +- llvm/test/CodeGen/X86/vec_insert-5.ll | 4 +- llvm/test/CodeGen/X86/vec_insert-7.ll | 11 +- llvm/test/CodeGen/X86/vec_insert-mmx.ll | 14 +- llvm/test/CodeGen/X86/vector-shuffle-mmx.ll | 30 +- llvm/test/CodeGen/X86/x86-64-psub.ll | 70 +- .../MemorySanitizer/X86/mmx-intrinsics.ll | 2543 +++++++++-------- .../MemorySanitizer/i386/mmx-intrinsics.ll | 1 - .../MemorySanitizer/vector_arith.ll | 28 +- .../MemorySanitizer/vector_cvt.ll | 16 +- .../MemorySanitizer/vector_pack.ll | 23 +- .../MemorySanitizer/vector_shift.ll | 20 +- .../X86/x86-GCC-inline-asm-Y-constraints.ll | 2 +- .../Transforms/InstCombine/X86/x86-movmsk.ll | 30 +- .../bitcast-vec-canon-inseltpoison.ll | 45 - .../InstCombine/bitcast-vec-canon.ll | 44 - llvm/test/Transforms/InstCombine/cast.ll | 21 - .../ConstProp/gep-zeroinit-vector.ll | 15 +- .../InstSimplify/ConstProp/loads.ll | 13 - llvm/test/Transforms/LoopUnroll/X86/mmx.ll | 35 - llvm/test/Transforms/SCCP/crash.ll | 6 +- llvm/test/Transforms/SROA/pr57796.ll | 14 +- llvm/test/Verifier/atomics.ll | 10 +- llvm/tools/llvm-c-test/echo.cpp | 2 - llvm/tools/llvm-stress/llvm-stress.cpp | 8 +- llvm/unittests/IR/InstructionsTest.cpp | 9 +- mlir/docs/Dialects/LLVM.md | 2 - mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h | 1 - mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 1 - mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp | 2 - mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp | 2 - mlir/lib/Target/LLVMIR/TypeToLLVM.cpp | 3 - offload/hostexec/services/execute_service.cpp | 3 - revert_patches.txt | 5 - 110 files changed, 3936 insertions(+), 3820 deletions(-) create mode 100644 llvm/docs/ReleaseNotes.rst delete mode 100644 llvm/test/Assembler/x86mmx.ll delete mode 100644 llvm/test/Transforms/LoopUnroll/X86/mmx.ll diff --git a/clang/test/OpenMP/allow-kernelc-io.c b/clang/test/OpenMP/allow-kernelc-io.c index fcdf6c2f575e02..934fb294a2cd73 100644 --- a/clang/test/OpenMP/allow-kernelc-io.c +++ b/clang/test/OpenMP/allow-kernelc-io.c @@ -48,7 +48,7 @@ int main(void) { // CHECK-NOPE-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 1 // CHECK-NOPE-NEXT: store i32 1, ptr addrspace(1) [[TMP3]], align 4 // CHECK-NOPE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 2 -// CHECK-NOPE-NEXT: store i32 983041, ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NOPE-NEXT: store i32 917505, ptr addrspace(1) [[TMP4]], align 4 // CHECK-NOPE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 3 // CHECK-NOPE-NEXT: store i32 11, ptr addrspace(1) [[TMP5]], align 4 // CHECK-NOPE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i64 16 @@ -78,7 +78,7 @@ int main(void) { // CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 1 // CHECK-NEXT: store i32 1, ptr addrspace(1) [[TMP3]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 2 -// CHECK-NEXT: store i32 983041, ptr addrspace(1) 
[[TMP4]], align 4 +// CHECK-NEXT: store i32 917505, ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[VARFN_ARGS_STORE]], ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i32 0, i32 3 // CHECK-NEXT: store i32 11, ptr addrspace(1) [[TMP5]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[VARFN_ARGS_STORE_CASTED]], i64 16 diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli index 25ce72e846d001..17cad1f43888be 100644 --- a/llvm/bindings/ocaml/llvm/llvm.mli +++ b/llvm/bindings/ocaml/llvm/llvm.mli @@ -766,10 +766,6 @@ val void_type : llcontext -> lltype [llvm::Type::LabelTy]. *) val label_type : llcontext -> lltype -(** [x86_mmx_type c] returns the x86 64-bit MMX register type in the - context [c]. See [llvm::Type::X86_MMXTy]. *) -val x86_mmx_type : llcontext -> lltype - (** [type_by_name m name] returns the specified type from the current module if it exists. See the method [llvm::Module::getTypeByName] *) diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c index 4ac824cd6a98a6..5906f427e69072 100644 --- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c +++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c @@ -686,11 +686,6 @@ value llvm_label_type(value Context) { return to_val(LLVMLabelTypeInContext(Context_val(Context))); } -/* llcontext -> lltype */ -value llvm_x86_mmx_type(value Context) { - return to_val(LLVMX86MMXTypeInContext(Context_val(Context))); -} - /* llmodule -> string -> lltype option */ value llvm_type_by_name(value M, value Name) { return ptr_to_option(LLVMGetTypeByName(Module_val(M), String_val(Name))); diff --git a/llvm/docs/BitCodeFormat.rst b/llvm/docs/BitCodeFormat.rst index 89933e3fc00502..8a26b101c4bf8e 100644 --- a/llvm/docs/BitCodeFormat.rst +++ b/llvm/docs/BitCodeFormat.rst @@ -1227,7 +1227,7 @@ TYPE_CODE_X86_MMX Record ``[X86_MMX]`` -The ``X86_MMX`` record (code 17) adds an ``x86_mmx`` type to the type table. +The ``X86_MMX`` record (code 17) is deprecated, and imported as a <1 x i64> vector. TYPE_CODE_STRUCT_ANON Record ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index df1002c51a4e7b..8bcb22bf72cc47 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4060,24 +4060,6 @@ or constants of this type. x86_amx -X86_mmx Type -"""""""""""" - -:Overview: - -The x86_mmx type represents a value held in an MMX register on an x86 -machine. The operations allowed on it are quite limited: parameters and -return values, load and store, and bitcast. User-specified MMX -instructions are represented as intrinsic or asm calls with arguments -and/or results of this type. There are no arrays, vectors or constants -of this type. - -:Syntax: - -:: - - x86_mmx - .. _t_pointer: @@ -4511,7 +4493,7 @@ represented by ``0xH`` followed by 4 hexadecimal digits. The bfloat 16-bit format is represented by ``0xR`` followed by 4 hexadecimal digits. All hexadecimal formats are big-endian (sign bit at the left). -There are no constants of type x86_mmx and x86_amx. +There are no constants of type x86_amx. .. _complexconstants: diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst new file mode 100644 index 00000000000000..551a9bec3b9161 --- /dev/null +++ b/llvm/docs/ReleaseNotes.rst @@ -0,0 +1,187 @@ +============================ +LLVM |release| Release Notes +============================ + +.. contents:: + :local: + +.. only:: PreRelease + + .. 
warning:: + These are in-progress notes for the upcoming LLVM |version| release. + Release notes for previous releases can be found on + `the Download Page `_. + + +Introduction +============ + +This document contains the release notes for the LLVM Compiler Infrastructure, +release |release|. Here we describe the status of LLVM, including major improvements +from the previous release, improvements in various subprojects of LLVM, and +some of the current users of the code. All LLVM releases may be downloaded +from the `LLVM releases web site `_. + +For more information about LLVM, including information about the latest +release, please check out the `main LLVM web site `_. If you +have questions or comments, the `Discourse forums +`_ is a good place to ask +them. + +Note that if you are reading this file from a Git checkout or the main +LLVM web page, this document applies to the *next* release, not the current +one. To see the release notes for a specific release, please see the `releases +page `_. + +Non-comprehensive list of changes in this release +================================================= +.. NOTE + For small 1-3 sentence descriptions, just add an entry at the end of + this list. If your description won't fit comfortably in one bullet + point (e.g. maybe you would like to give an example of the + functionality, or simply have a lot to talk about), see the `NOTE` below + for adding a new subsection. + +* ... + +Update on required toolchains to build LLVM +------------------------------------------- + +Changes to the LLVM IR +---------------------- + +* The ``x86_mmx`` IR type has been removed. It will be translated to + the standard vector type ``<1 x i64>`` in bitcode upgrade. + +Changes to LLVM infrastructure +------------------------------ + +Changes to building LLVM +------------------------ + +Changes to TableGen +------------------- + +Changes to Interprocedural Optimizations +---------------------------------------- + +Changes to the AArch64 Backend +------------------------------ + +* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill + the required alignment space with a sequence of `0x0` bytes (the requested + fill value) rather than NOPs. + +Changes to the AMDGPU Backend +----------------------------- + +Changes to the ARM Backend +-------------------------- + +* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill + the required alignment space with a sequence of `0x0` bytes (the requested + fill value) rather than NOPs. + +Changes to the AVR Backend +-------------------------- + +Changes to the DirectX Backend +------------------------------ + +Changes to the Hexagon Backend +------------------------------ + +Changes to the LoongArch Backend +-------------------------------- + +Changes to the MIPS Backend +--------------------------- + +Changes to the PowerPC Backend +------------------------------ + +Changes to the RISC-V Backend +----------------------------- + +* `.balign N, 0`, `.p2align N, 0`, `.align N, 0` in code sections will now fill + the required alignment space with a sequence of `0x0` bytes (the requested + fill value) rather than NOPs. 
+ +Changes to the WebAssembly Backend +---------------------------------- + +Changes to the Windows Target +----------------------------- + +Changes to the X86 Backend +-------------------------- + +* `.balign N, 0x90`, `.p2align N, 0x90`, and `.align N, 0x90` in code sections + now fill the required alignment space with repeating `0x90` bytes, rather than + using optimised NOP filling. Optimised NOP filling fills the space with NOP + instructions of various widths, not just those that use the `0x90` byte + encoding. To use optimised NOP filling in a code section, leave off the + "fillval" argument, i.e. `.balign N`, `.p2align N` or `.align N` respectively. + +* Due to the removal of the ``x86_mmx`` IR type, functions with + ``x86_mmx`` arguments or return values will use a different, + incompatible, calling convention ABI. Such functions are not + generally seen in the wild (Clang never generates them!), so this is + not expected to result in real-world compatibility problems. + +Changes to the OCaml bindings +----------------------------- + +Changes to the Python bindings +------------------------------ + +Changes to the C API +-------------------- + +* The following symbols are deleted due to the removal of the ``x86_mmx`` IR type: + + * ``LLVMX86_MMXTypeKind`` + * ``LLVMX86MMXTypeInContext`` + * ``LLVMX86MMXType`` + +Changes to the CodeGen infrastructure +------------------------------------- + +Changes to the Metadata Info +--------------------------------- + +Changes to the Debug Info +--------------------------------- + +Changes to the LLVM tools +--------------------------------- + +Changes to LLDB +--------------------------------- + +Changes to BOLT +--------------------------------- + +Changes to Sanitizers +--------------------- + +Other Changes +------------- + +External Open Source Projects Using LLVM 19 +=========================================== + +* A project... + +Additional Information +====================== + +A wide variety of additional information is available on the `LLVM web page +`_, in particular in the `documentation +`_ section. The web page also contains versions of the +API documentation which is up-to-date with the Git version of the source +code. You can access versions of these documents specific to this release by +going into the ``llvm/docs/`` directory in the LLVM tree. + +If you have any questions or comments about LLVM, please feel free to contact +us via the `Discourse forums `_. 
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 978ecb0b686676..dc8ecf4fb2ade2 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -146,27 +146,27 @@ typedef enum { } LLVMOpcode; typedef enum { - LLVMVoidTypeKind, /**< type with no size */ - LLVMHalfTypeKind, /**< 16 bit floating point type */ - LLVMFloatTypeKind, /**< 32 bit floating point type */ - LLVMDoubleTypeKind, /**< 64 bit floating point type */ - LLVMX86_FP80TypeKind, /**< 80 bit floating point type (X87) */ - LLVMFP128TypeKind, /**< 128 bit floating point type (112-bit mantissa)*/ - LLVMPPC_FP128TypeKind, /**< 128 bit floating point type (two 64-bits) */ - LLVMLabelTypeKind, /**< Labels */ - LLVMIntegerTypeKind, /**< Arbitrary bit width integers */ - LLVMFunctionTypeKind, /**< Functions */ - LLVMStructTypeKind, /**< Structures */ - LLVMArrayTypeKind, /**< Arrays */ - LLVMPointerTypeKind, /**< Pointers */ - LLVMVectorTypeKind, /**< Fixed width SIMD vector type */ - LLVMMetadataTypeKind, /**< Metadata */ - LLVMX86_MMXTypeKind, /**< X86 MMX */ - LLVMTokenTypeKind, /**< Tokens */ - LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */ - LLVMBFloatTypeKind, /**< 16 bit brain floating point type */ - LLVMX86_AMXTypeKind, /**< X86 AMX */ - LLVMTargetExtTypeKind, /**< Target extension type */ + LLVMVoidTypeKind = 0, /**< type with no size */ + LLVMHalfTypeKind = 1, /**< 16 bit floating point type */ + LLVMFloatTypeKind = 2, /**< 32 bit floating point type */ + LLVMDoubleTypeKind = 3, /**< 64 bit floating point type */ + LLVMX86_FP80TypeKind = 4, /**< 80 bit floating point type (X87) */ + LLVMFP128TypeKind = 5, /**< 128 bit floating point type (112-bit mantissa)*/ + LLVMPPC_FP128TypeKind = 6, /**< 128 bit floating point type (two 64-bits) */ + LLVMLabelTypeKind = 7, /**< Labels */ + LLVMIntegerTypeKind = 8, /**< Arbitrary bit width integers */ + LLVMFunctionTypeKind = 9, /**< Functions */ + LLVMStructTypeKind = 10, /**< Structures */ + LLVMArrayTypeKind = 11, /**< Arrays */ + LLVMPointerTypeKind = 12, /**< Pointers */ + LLVMVectorTypeKind = 13, /**< Fixed width SIMD vector type */ + LLVMMetadataTypeKind = 14, /**< Metadata */ + /* 15 previously used by LLVMX86_MMXTypeKind */ + LLVMTokenTypeKind = 16, /**< Tokens */ + LLVMScalableVectorTypeKind = 17, /**< Scalable SIMD vector type */ + LLVMBFloatTypeKind = 18, /**< 16 bit brain floating point type */ + LLVMX86_AMXTypeKind = 19, /**< X86 AMX */ + LLVMTargetExtTypeKind = 20, /**< Target extension type */ } LLVMTypeKind; typedef enum { @@ -1734,11 +1734,6 @@ LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C); */ LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C); -/** - * Create a X86 MMX type in a context. - */ -LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C); - /** * Create a X86 AMX type in a context. 
*/ @@ -1760,7 +1755,6 @@ LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C); */ LLVMTypeRef LLVMVoidType(void); LLVMTypeRef LLVMLabelType(void); -LLVMTypeRef LLVMX86MMXType(void); LLVMTypeRef LLVMX86AMXType(void); /** diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 829249a6fbe38b..93bd519f5727d8 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -646,7 +646,6 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const { case Type::FloatTyID: return TypeSize::getFixed(32); case Type::DoubleTyID: - case Type::X86_MMXTyID: return TypeSize::getFixed(64); case Type::PPC_FP128TyID: case Type::FP128TyID: diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index f03684372d3877..7fa940ab347af6 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -63,7 +63,6 @@ class Type { VoidTyID, ///< type with no size LabelTyID, ///< Labels MetadataTyID, ///< Metadata - X86_MMXTyID, ///< MMX vectors (64 bits, X86 specific) X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific) TokenTyID, ///< Tokens @@ -197,9 +196,6 @@ class Type { const fltSemantics &getFltSemantics() const; - /// Return true if this is X86 MMX. - bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; } - /// Return true if this is X86 AMX. bool isX86_AMXTy() const { return getTypeID() == X86_AMXTyID; } @@ -285,8 +281,8 @@ class Type { /// Return true if the type is a valid type for a register in codegen. This /// includes all first-class types except struct and array types. bool isSingleValueType() const { - return isFloatingPointTy() || isX86_MMXTy() || isIntegerTy() || - isPointerTy() || isVectorTy() || isX86_AMXTy() || isTargetExtTy(); + return isFloatingPointTy() || isIntegerTy() || isPointerTy() || + isVectorTy() || isX86_AMXTy() || isTargetExtTy(); } /// Return true if the type is an aggregate type. This means it is valid as @@ -302,8 +298,7 @@ class Type { bool isSized(SmallPtrSetImpl *Visited = nullptr) const { // If it's a primitive, it is always sized. if (getTypeID() == IntegerTyID || isFloatingPointTy() || - getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID || - getTypeID() == X86_AMXTyID) + getTypeID() == PointerTyID || getTypeID() == X86_AMXTyID) return true; // If it is not something that can have a size (e.g. a function or label), // it doesn't have a size. @@ -445,7 +440,6 @@ class Type { static Type *getX86_FP80Ty(LLVMContext &C); static Type *getFP128Ty(LLVMContext &C); static Type *getPPC_FP128Ty(LLVMContext &C); - static Type *getX86_MMXTy(LLVMContext &C); static Type *getX86_AMXTy(LLVMContext &C); static Type *getTokenTy(LLVMContext &C); static IntegerType *getIntNTy(LLVMContext &C, unsigned N); diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 5f6df3fa06592e..88db315ffd0bcb 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -565,16 +565,14 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy, Type *MapTy = Type::getIntNTy(C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedValue()); if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) { - if (Res->isNullValue() && !LoadTy->isX86_MMXTy() && - !LoadTy->isX86_AMXTy()) + if (Res->isNullValue() && !LoadTy->isX86_AMXTy()) // Materializing a zero can be done trivially without a bitcast return Constant::getNullValue(LoadTy); Type *CastTy = LoadTy->isPtrOrPtrVectorTy() ? 
DL.getIntPtrType(LoadTy) : LoadTy; Res = FoldBitCast(Res, CastTy, DL); if (LoadTy->isPtrOrPtrVectorTy()) { // For vector of pointer, we needed to first convert to a vector of integer, then do vector inttoptr - if (Res->isNullValue() && !LoadTy->isX86_MMXTy() && - !LoadTy->isX86_AMXTy()) + if (Res->isNullValue() && !LoadTy->isX86_AMXTy()) return Constant::getNullValue(LoadTy); if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) // Be careful not to replace a load of an addrspace value with an inttoptr here @@ -765,7 +763,7 @@ Constant *llvm::ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty, // uniform. if (!DL.typeSizeEqualsStoreSize(C->getType())) return nullptr; - if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) + if (C->isNullValue() && !Ty->isX86_AMXTy()) return Constant::getNullValue(Ty); if (C->isAllOnesValue() && (Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy())) diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 65343c9630384d..ae3c89ab1acc36 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -881,7 +881,6 @@ lltok::Kind LLLexer::LexIdentifier() { TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context)); TYPEKEYWORD("label", Type::getLabelTy(Context)); TYPEKEYWORD("metadata", Type::getMetadataTy(Context)); - TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); TYPEKEYWORD("x86_amx", Type::getX86_AMXTy(Context)); TYPEKEYWORD("token", Type::getTokenTy(Context)); TYPEKEYWORD("ptr", PointerType::getUnqual(Context)); diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index f98d4751d3f2ea..78fb92dc44494f 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2528,7 +2528,9 @@ Error BitcodeReader::parseTypeTableBody() { ResultTy = Type::getMetadataTy(Context); break; case bitc::TYPE_CODE_X86_MMX: // X86_MMX - ResultTy = Type::getX86_MMXTy(Context); + // Deprecated: decodes as <1 x i64> + ResultTy = + llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); break; case bitc::TYPE_CODE_X86_AMX: // X86_AMX ResultTy = Type::getX86_AMXTy(Context); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 9bda3d282a5bb7..1f62bc8ecfa106 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1117,8 +1117,9 @@ void ModuleBitcodeWriter::writeTypeTable() { case Type::FP128TyID: Code = bitc::TYPE_CODE_FP128; break; case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break; case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break; - case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; - case Type::X86_MMXTyID: Code = bitc::TYPE_CODE_X86_MMX; break; + case Type::MetadataTyID: + Code = bitc::TYPE_CODE_METADATA; + break; case Type::X86_AMXTyID: Code = bitc::TYPE_CODE_X86_AMX; break; case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break; case Type::IntegerTyID: diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index b10c58babf93fc..e3c746b274dde1 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -214,7 +214,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { assert(isExtended() && "Type is not extended!"); return LLVMTy; case MVT::isVoid: return Type::getVoidTy(Context); - case MVT::x86mmx: return Type::getX86_MMXTy(Context); + case MVT::x86mmx: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 
64), 1); case MVT::aarch64svcount: return TargetExtType::get(Context, "aarch64.svcount"); case MVT::x86amx: return Type::getX86_AMXTy(Context); @@ -248,8 +248,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){ case Type::BFloatTyID: return MVT(MVT::bf16); case Type::FloatTyID: return MVT(MVT::f32); case Type::DoubleTyID: return MVT(MVT::f64); - case Type::X86_FP80TyID: return MVT(MVT::f80); - case Type::X86_MMXTyID: return MVT(MVT::x86mmx); + case Type::X86_FP80TyID: + return MVT(MVT::f80); case Type::TargetExtTyID: { TargetExtType *TargetExtTy = cast(Ty); if (TargetExtTy->getName() == "aarch64.svcount") @@ -334,4 +334,3 @@ void MVT::print(raw_ostream &OS) const { else OS << EVT(*this).getEVTString(); } - diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ad65e33119b85f..da4b9ec87a6f46 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -572,8 +572,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) { case Type::FP128TyID: OS << "fp128"; return; case Type::PPC_FP128TyID: OS << "ppc_fp128"; return; case Type::LabelTyID: OS << "label"; return; - case Type::MetadataTyID: OS << "metadata"; return; - case Type::X86_MMXTyID: OS << "x86_mmx"; return; + case Type::MetadataTyID: + OS << "metadata"; + return; case Type::X86_AMXTyID: OS << "x86_amx"; return; case Type::TokenTyID: OS << "token"; return; case Type::IntegerTyID: diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index d3566c7ddf66b9..cfe87937c372cd 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -142,7 +142,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, return UndefValue::get(DestTy); } - if (V->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy() && + if (V->isNullValue() && !DestTy->isX86_AMXTy() && opc != Instruction::AddrSpaceCast) return Constant::getNullValue(DestTy); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index e2b91f3781f03b..b497d7cc829d3a 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -617,8 +617,6 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) { return LLVMPointerTypeKind; case Type::FixedVectorTyID: return LLVMVectorTypeKind; - case Type::X86_MMXTyID: - return LLVMX86_MMXTypeKind; case Type::X86_AMXTyID: return LLVMX86_AMXTypeKind; case Type::TokenTyID: @@ -733,9 +731,6 @@ LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) { LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C)); } -LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) { - return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C)); -} LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C) { return (LLVMTypeRef) Type::getX86_AMXTy(*unwrap(C)); } @@ -761,9 +756,6 @@ LLVMTypeRef LLVMFP128Type(void) { LLVMTypeRef LLVMPPCFP128Type(void) { return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext()); } -LLVMTypeRef LLVMX86MMXType(void) { - return LLVMX86MMXTypeInContext(LLVMGetGlobalContext()); -} LLVMTypeRef LLVMX86AMXType(void) { return LLVMX86AMXTypeInContext(LLVMGetGlobalContext()); } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 7c405a18abbc56..a4af0ead07cf61 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -823,7 +823,6 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const { // layout. 
return Align(PowerOf2Ceil(BitWidth / 8)); } - case Type::X86_MMXTyID: case Type::FixedVectorTyID: case Type::ScalableVectorTyID: { unsigned BitWidth = getTypeSizeInBits(Ty).getKnownMinValue(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 19f51b06ab184a..05e340ffa20a07 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3107,9 +3107,6 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) { if (SrcBits != DestBits) return false; - if (DestTy->isX86_MMXTy() || SrcTy->isX86_MMXTy()) - return false; - return true; } @@ -3219,12 +3216,6 @@ CastInst::getCastOpcode( return IntToPtr; // int -> ptr } llvm_unreachable("Casting pointer to other than pointer or int"); - } else if (DestTy->isX86_MMXTy()) { - if (SrcTy->isVectorTy()) { - assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX"); - return BitCast; // 64-bit vector to MMX - } - llvm_unreachable("Illegal cast to X86_MMX"); } llvm_unreachable("Casting to type that is not first-class"); } diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 46df6845ff3ab7..3130a0bd2955a5 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -140,9 +140,6 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) { case Type::PPC_FP128TyID: Result += "ppcf128"; break; - case Type::X86_MMXTyID: - Result += "x86mmx"; - break; case Type::X86_AMXTyID: Result += "x86amx"; break; @@ -497,7 +494,7 @@ static Type *DecodeFixedType(ArrayRef &Infos, case IITDescriptor::VarArg: return Type::getVoidTy(Context); case IITDescriptor::MMX: - return Type::getX86_MMXTy(Context); + return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1); case IITDescriptor::AMX: return Type::getX86_AMXTy(Context); case IITDescriptor::Token: @@ -797,7 +794,11 @@ matchIntrinsicType(Type *Ty, ArrayRef &Infos, return !Ty->isVoidTy(); case IITDescriptor::VarArg: return true; - case IITDescriptor::MMX: return !Ty->isX86_MMXTy(); + case IITDescriptor::MMX: { + FixedVectorType *VT = dyn_cast(Ty); + return !VT || VT->getNumElements() != 1 || + !VT->getElementType()->isIntegerTy(64); + } case IITDescriptor::AMX: return !Ty->isX86_AMXTy(); case IITDescriptor::Token: diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 01024d27444909..f42c15c8f749a5 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -37,9 +37,9 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C) FloatTy(C, Type::FloatTyID), DoubleTy(C, Type::DoubleTyID), MetadataTy(C, Type::MetadataTyID), TokenTy(C, Type::TokenTyID), X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID), - PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID), - X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8), - Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) {} + PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_AMXTy(C, Type::X86_AMXTyID), + Int1Ty(C, 1), Int8Ty(C, 8), Int16Ty(C, 16), Int32Ty(C, 32), + Int64Ty(C, 64), Int128Ty(C, 128) {} LLVMContextImpl::~LLVMContextImpl() { #ifndef NDEBUG diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 0936611cab1d83..4ce42904a69d46 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1617,7 +1617,7 @@ class LLVMContextImpl { // Basic type instances. 
Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy, TokenTy; - Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy, X86_AMXTy; + Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_AMXTy; IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty; std::unique_ptr TheNoneToken; diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index cb91b973aabc73..151f36c91e0d93 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -47,8 +47,8 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) { case FP128TyID : return getFP128Ty(C); case PPC_FP128TyID : return getPPC_FP128Ty(C); case LabelTyID : return getLabelTy(C); - case MetadataTyID : return getMetadataTy(C); - case X86_MMXTyID : return getX86_MMXTy(C); + case MetadataTyID: + return getMetadataTy(C); case X86_AMXTyID : return getX86_AMXTy(C); case TokenTyID : return getTokenTy(C); default: @@ -138,14 +138,6 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const { if (isa(this) && isa(Ty)) return getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits(); - // 64-bit fixed width vector types can be losslessly converted to x86mmx. - if (((isa(this)) && Ty->isX86_MMXTy()) && - getPrimitiveSizeInBits().getFixedValue() == 64) - return true; - if ((isX86_MMXTy() && isa(Ty)) && - Ty->getPrimitiveSizeInBits().getFixedValue() == 64) - return true; - // 8192-bit fixed width vector types can be losslessly converted to x86amx. if (((isa(this)) && Ty->isX86_AMXTy()) && getPrimitiveSizeInBits().getFixedValue() == 8192) @@ -192,8 +184,6 @@ TypeSize Type::getPrimitiveSizeInBits() const { return TypeSize::getFixed(128); case Type::PPC_FP128TyID: return TypeSize::getFixed(128); - case Type::X86_MMXTyID: - return TypeSize::getFixed(64); case Type::X86_AMXTyID: return TypeSize::getFixed(8192); case Type::IntegerTyID: @@ -258,7 +248,6 @@ Type *Type::getTokenTy(LLVMContext &C) { return &C.pImpl->TokenTy; } Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; } Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; } Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; } -Type *Type::getX86_MMXTy(LLVMContext &C) { return &C.pImpl->X86_MMXTy; } Type *Type::getX86_AMXTy(LLVMContext &C) { return &C.pImpl->X86_AMXTy; } IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; } diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 0be978e41350c1..45aadac861946b 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1048,9 +1048,6 @@ void DXILBitcodeWriter::writeTypeTable() { case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break; - case Type::X86_MMXTyID: - Code = bitc::TYPE_CODE_X86_MMX; - break; case Type::IntegerTyID: // INTEGER: [width] Code = bitc::TYPE_CODE_INTEGER; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index e5d10a75728bf8..0c1b0aea41f41f 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -329,7 +329,6 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty, case Type::PPC_FP128TyID: case Type::LabelTyID: case Type::MetadataTyID: - case Type::X86_MMXTyID: case Type::X86_AMXTyID: case Type::TokenTyID: case Type::TypedPointerTyID: diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td 
index ddbc267c151157..91af111db8cda5 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -168,10 +168,6 @@ def CC_#NAME : CallingConv<[ CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64], CCAssignToStack<8, 4>>, - // MMX type gets 8 byte slot in stack , while alignment depends on target - CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>, - CCIfType<[x86mmx], CCAssignToStack<8, 4>>, - // float 128 get stack slots whose size and alignment depends // on the subtarget. CCIfType<[f80, f128], CCAssignToStack<0, 0>>, @@ -286,10 +282,6 @@ def RetCC_X86Common : CallingConv<[ CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, - // MMX vector types are always returned in MM0. If the target doesn't have - // MM0, it doesn't support these vector types. - CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, - // Long double types are always returned in FP0 (even with SSE), // except on Win64. CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>> @@ -376,9 +368,6 @@ def RetCC_X86_64_C : CallingConv<[ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, - // MMX vector types are always returned in XMM0. - CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, - // Pointers are always returned in full 64-bit registers. CCIfPtr>, @@ -389,9 +378,6 @@ def RetCC_X86_64_C : CallingConv<[ // X86-Win64 C return-value convention. def RetCC_X86_Win64_C : CallingConv<[ - // The X86-Win64 calling convention always returns __m64 values in RAX. - CCIfType<[x86mmx], CCBitConvertToType>, - // GCC returns FP values in RAX on Win64. CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, @@ -436,8 +422,6 @@ def RetCC_X86_64_Swift : CallingConv<[ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, - // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3. - CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, CCDelegateTo ]>; @@ -572,12 +556,6 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, - // The first 8 MMX vector arguments are passed in XMM registers on Darwin. - CCIfType<[x86mmx], - CCIfSubtarget<"isTargetDarwin()", - CCIfSubtarget<"hasSSE2()", - CCPromoteToType>>>, - // Boolean vectors of AVX-512 are passed in SIMD registers. // The call from AVX to AVX-512 function should work, // since the boolean types in AVX/AVX2 are promoted by default. @@ -666,9 +644,6 @@ def CC_X86_Win64_C : CallingConv<[ // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, - // The first 4 MMX vector arguments are passed in GPRs. - CCIfType<[x86mmx], CCBitConvertToType>, - // If SSE was disabled, pass FP values smaller than 64-bits as integers in // GPRs or on the stack. CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, @@ -843,11 +818,6 @@ def CC_X86_32_Common : CallingConv<[ CCIfNotVarArg>>>, - // The first 3 __m64 vector arguments are passed in mmx registers if the - // call is not a vararg call. - CCIfNotVarArg>>, - CCIfType<[f16], CCAssignToStack<4, 4>>, // Integer/Float values get stored in stack slots that are 4 bytes in @@ -870,10 +840,6 @@ def CC_X86_32_Common : CallingConv<[ CCIfType<[v32i1], CCPromoteToType>, CCIfType<[v64i1], CCPromoteToType>, - // __m64 vectors get 8-byte stack slots that are 4-byte aligned. 
They are - // passed in the parameter area. - CCIfType<[x86mmx], CCAssignToStack<8, 4>>, - // Darwin passes vectors in a form that differs from the i386 psABI CCIfSubtarget<"isTargetDarwin()", CCDelegateTo>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 713f4fb7d4cc14..91e48f1e77db12 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2649,7 +2649,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND, ISD::FP_ROUND, - ISD::STRICT_FP_ROUND}); + ISD::STRICT_FP_ROUND, + ISD::INTRINSIC_VOID, + ISD::INTRINSIC_WO_CHAIN, + ISD::INTRINSIC_W_CHAIN}); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -27652,6 +27655,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, llvm_unreachable("Unsupported truncstore intrinsic"); } } + case INTR_TYPE_CAST_MMX: + return SDValue(); // handled in combineINTRINSIC_* } } @@ -58616,6 +58621,86 @@ static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>, +// and so SelectionDAGBuilder creates them with v1i64 types, but they need to +// use x86mmx instead. +static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) { + SDLoc dl(N); + + bool MadeChange = false, CastReturnVal = false; + SmallVector Args; + for (const SDValue &Arg : N->op_values()) { + if (Arg.getValueType() == MVT::v1i64) { + MadeChange = true; + Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg)); + } else + Args.push_back(Arg); + } + SDVTList VTs = N->getVTList(); + SDVTList NewVTs = VTs; + if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) { + SmallVector NewVTArr(ArrayRef(VTs.VTs, VTs.NumVTs)); + NewVTArr[0] = MVT::x86mmx; + NewVTs = DAG.getVTList(NewVTArr); + MadeChange = true; + CastReturnVal = true; + } + + if (MadeChange) { + SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args); + if (CastReturnVal) { + SmallVector Returns; + for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i) + Returns.push_back(Result.getValue(i)); + Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]); + return DAG.getMergeValues(Returns, dl); + } + return Result; + } + return SDValue(); +} +static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(0); + const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + +static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(1); + const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + +static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned IntNo = N->getConstantOperandVal(1); + const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); + + if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX) + return FixupMMXIntrinsicTypes(N, DAG); + + return SDValue(); +} + SDValue 
X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -58806,7 +58891,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI); case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); case X86ISD::PDEP: return combinePDEP(N, DAG, DCI); - // clang-format on + case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI); + case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI); + case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI); + // clang-format on } return SDValue(); diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 6f51d4141c959c..7c9738bf082164 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -623,11 +623,13 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II, if (isa(Arg)) return Constant::getNullValue(ResTy); - auto *ArgTy = dyn_cast(Arg->getType()); - // We can't easily peek through x86_mmx types. - if (!ArgTy) + // Preserve previous behavior and give up. + // TODO: treat as <8 x i8>. + if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) return nullptr; + auto *ArgTy = cast(Arg->getType()); + // Expand MOVMSK to compare/bitcast/zext: // e.g. PMOVMSKB(v16i8 x): // %cmp = icmp slt <16 x i8> %x, zeroinitializer diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index f9abb16767c91b..86fd04046d16a0 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -74,7 +74,8 @@ enum IntrinsicType : uint16_t { GATHER_AVX2, ROUNDP, ROUNDS, - RDPRU + RDPRU, + INTR_TYPE_CAST_MMX }; struct IntrinsicData { @@ -324,6 +325,8 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(mmx_maskmovq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_movnt_dq, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0), X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), @@ -2019,6 +2022,75 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + + X86_INTRINSIC_DATA(mmx_packssdw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_packsswb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_packuswb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padd_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padds_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_padds_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_paddus_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_paddus_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_palignr_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pand, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pandn, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pavg_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pavg_w, 
INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpeq_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pcmpgt_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pextr_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pinsr_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmadd_wd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmaxs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmaxu_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmins_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pminu_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmovmskb, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulh_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulhu_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmull_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pmulu_dq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_por, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psad_bw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psll_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pslli_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psra_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psra_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrai_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrai_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrl_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psrli_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_q, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psub_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubs_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubus_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_psubus_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhbw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhdq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckhwd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpcklbw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpckldq, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_punpcklwd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(mmx_pxor, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), @@ -2027,8 +2099,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE), X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), 
X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse_cvtpd2pi, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtpi2pd, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtpi2ps, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvtps2pi, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_cvtss2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), X86_INTRINSIC_DATA(sse_cvtss2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0), + X86_INTRINSIC_DATA(sse_cvttpd2pi, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(sse_cvttps2pi, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0), X86_INTRINSIC_DATA(sse_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0), X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), @@ -2036,6 +2114,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0), X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), + X86_INTRINSIC_DATA(sse_pshuf_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0), X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), @@ -2118,14 +2197,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0), X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0), + X86_INTRINSIC_DATA(ssse3_pabs_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_pabs_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_pabs_w, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(ssse3_pmul_hr_sw, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(ssse3_pshuf_b, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(ssse3_psign_b, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_psign_d, INTR_TYPE_CAST_MMX, 0, 0), + X86_INTRINSIC_DATA(ssse3_psign_w, INTR_TYPE_CAST_MMX, 0, 0), X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0), diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 005db42f609766..ed93b4491c50e4 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ 
b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -977,8 +977,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { } else if (NewCB->getType()->isVoidTy()) { // If the return value is dead, replace any uses of it with poison // (any non-debug value uses will get removed later on). - if (!CB.getType()->isX86_MMXTy()) - CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); + CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); } else { assert((RetTy->isStructTy() || RetTy->isArrayTy()) && "Return type changed, but not into a void. The old return type" @@ -1042,8 +1041,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { } else { // If this argument is dead, replace any uses of it with poison // (any non-debug value uses will get removed later on). - if (!I->getType()->isX86_MMXTy()) - I->replaceAllUsesWith(PoisonValue::get(I->getType())); + I->replaceAllUsesWith(PoisonValue::get(I->getType())); } // If we change the return value of the function we must rewrite any return diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index e98c0a7665237a..c1417afee6b3b3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2750,13 +2750,6 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { return replaceInstUsesWith(CI, Src); if (FixedVectorType *DestVTy = dyn_cast(DestTy)) { - // Beware: messing with this target-specific oddity may cause trouble. - if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) { - Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType()); - return InsertElementInst::Create(PoisonValue::get(DestTy), Elem, - Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); - } - if (isa(SrcTy)) { // If this is a cast from an integer to vector, check to see if the input // is a trunc or zext of a bitcast from vector. If so, we can replace all diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 11435e25183cb7..624ea4cdd814f5 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2967,8 +2967,7 @@ struct MemorySanitizerVisitor : public InstVisitor { /// Caller guarantees that this intrinsic does not access memory. bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) { Type *RetTy = I.getType(); - if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy() || - RetTy->isX86_MMXTy())) + if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy())) return false; unsigned NumArgOperands = I.arg_size(); @@ -3197,7 +3196,7 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } - // Get an X86_MMX-sized vector type. + // Get an MMX-sized vector type. Type *getMMXVectorTy(unsigned EltSizeInBits) { const unsigned X86_MMXSizeInBits = 64; assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 && @@ -3243,20 +3242,21 @@ struct MemorySanitizerVisitor : public InstVisitor { // packs elements of 2 input vectors into half as many bits with saturation. // Shadow is propagated with the signed variant of the same intrinsic applied // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer). - // EltSizeInBits is used only for x86mmx arguments. 
- void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) { + // MMXEltSizeInBits is used only for x86mmx arguments. + void handleVectorPackIntrinsic(IntrinsicInst &I, + unsigned MMXEltSizeInBits = 0) { assert(I.arg_size() == 2); - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); IRBuilder<> IRB(&I); Value *S1 = getShadow(&I, 0); Value *S2 = getShadow(&I, 1); - assert(isX86_MMX || S1->getType()->isVectorTy()); + assert(S1->getType()->isVectorTy()); // SExt and ICmpNE below must apply to individual elements of input vectors. // In case of x86mmx arguments, cast them to appropriate vector types and // back. - Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType(); - if (isX86_MMX) { + Type *T = + MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits) : S1->getType(); + if (MMXEltSizeInBits) { S1 = IRB.CreateBitCast(S1, T); S2 = IRB.CreateBitCast(S2, T); } @@ -3264,16 +3264,17 @@ struct MemorySanitizerVisitor : public InstVisitor { IRB.CreateSExt(IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T); Value *S2_ext = IRB.CreateSExt(IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T); - if (isX86_MMX) { - Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C); - S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy); - S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy); + if (MMXEltSizeInBits) { + S1_ext = IRB.CreateBitCast(S1_ext, getMMXVectorTy(64)); + S2_ext = IRB.CreateBitCast(S2_ext, getMMXVectorTy(64)); } - Value *S = IRB.CreateIntrinsic(getSignedPackIntrinsic(I.getIntrinsicID()), - {}, {S1_ext, S2_ext}, /*FMFSource=*/nullptr, - "_msprop_vector_pack"); - if (isX86_MMX) + Function *ShadowFn = Intrinsic::getDeclaration( + F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); + + Value *S = + IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack"); + if (MMXEltSizeInBits) S = IRB.CreateBitCast(S, getShadowTy(&I)); setShadow(&I, S); setOriginForNaryOp(I); @@ -3380,10 +3381,9 @@ struct MemorySanitizerVisitor : public InstVisitor { } // Instrument sum-of-absolute-differences intrinsic. - void handleVectorSadIntrinsic(IntrinsicInst &I) { + void handleVectorSadIntrinsic(IntrinsicInst &I, bool IsMMX = false) { const unsigned SignificantBitsPerResultElement = 16; - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); - Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType(); + Type *ResTy = IsMMX ? IntegerType::get(*MS.C, 64) : I.getType(); unsigned ZeroBitsPerResultElement = ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement; @@ -3402,9 +3402,9 @@ struct MemorySanitizerVisitor : public InstVisitor { // Instrument multiply-add intrinsic. void handleVectorPmaddIntrinsic(IntrinsicInst &I, - unsigned EltSizeInBits = 0) { - bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); - Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType(); + unsigned MMXEltSizeInBits = 0) { + Type *ResTy = + MMXEltSizeInBits ? 
getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType(); IRBuilder<> IRB(&I); auto *Shadow0 = getShadow(&I, 0); auto *Shadow1 = getShadow(&I, 1); @@ -4236,6 +4236,8 @@ struct MemorySanitizerVisitor : public InstVisitor { break; case Intrinsic::x86_mmx_psad_bw: + handleVectorSadIntrinsic(I, true); + break; case Intrinsic::x86_sse2_psad_bw: case Intrinsic::x86_avx2_psad_bw: handleVectorSadIntrinsic(I); @@ -5152,7 +5154,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase { Type *T = arg->getType(); if (T->isX86_FP80Ty()) return AK_Memory; - if (T->isFPOrFPVectorTy() || T->isX86_MMXTy()) + if (T->isFPOrFPVectorTy()) return AK_FloatingPoint; if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) return AK_GeneralPurpose; diff --git a/llvm/test/Assembler/x86mmx.ll b/llvm/test/Assembler/x86mmx.ll deleted file mode 100644 index 608347e0fceb10..00000000000000 --- a/llvm/test/Assembler/x86mmx.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llvm-as < %s | llvm-dis | FileCheck %s -; RUN: verify-uselistorder %s -; Basic smoke test for x86_mmx type. - -; CHECK: define x86_mmx @sh16 -define x86_mmx @sh16(x86_mmx %A) { -; CHECK: ret x86_mmx %A - ret x86_mmx %A -} diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll index 12dd66957fff4b..c4b932034b501a 100644 --- a/llvm/test/Bindings/llvm-c/echo.ll +++ b/llvm/test/Bindings/llvm-c/echo.ll @@ -70,7 +70,7 @@ define void @types() { %9 = alloca [3 x i22], align 4 %10 = alloca ptr addrspace(5), align 8 %11 = alloca <5 x ptr>, align 64 - %12 = alloca x86_mmx, align 8 + %12 = alloca <1 x i64>, align 8 ret void } diff --git a/llvm/test/Bitcode/bcanalyzer-types.ll b/llvm/test/Bitcode/bcanalyzer-types.ll index cbe6f5d22c9479..f1732db174c295 100644 --- a/llvm/test/Bitcode/bcanalyzer-types.ll +++ b/llvm/test/Bitcode/bcanalyzer-types.ll @@ -3,7 +3,6 @@ ; CHECK: Block ID {{.*}} (TYPE_BLOCK_ID) ; CHECK: BFLOAT ; CHECK: TOKEN -; CHECK: X86_MMX ; CHECK: HALF ; CHECK: Block ID @@ -12,11 +11,6 @@ define half @test_half(half %x, half %y) { ret half %a } -define x86_mmx @test_mmx(<2 x i32> %x) { - %a = bitcast <2 x i32> %x to x86_mmx - ret x86_mmx %a -} - define bfloat @test_bfloat(i16 %x) { %a = bitcast i16 %x to bfloat ret bfloat %a diff --git a/llvm/test/Bitcode/compatibility-3.6.ll b/llvm/test/Bitcode/compatibility-3.6.ll index 2190e2fbccf288..37a87eea41ad36 100644 --- a/llvm/test/Bitcode/compatibility-3.6.ll +++ b/llvm/test/Bitcode/compatibility-3.6.ll @@ -645,7 +645,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.7.ll b/llvm/test/Bitcode/compatibility-3.7.ll index 7e59b5c1be6e2f..8de2132d7ec892 100644 --- a/llvm/test/Bitcode/compatibility-3.7.ll +++ b/llvm/test/Bitcode/compatibility-3.7.ll @@ -689,7 +689,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.8.ll b/llvm/test/Bitcode/compatibility-3.8.ll index ebd1f2fff8c94c..7f766aa34a005f 100644 --- a/llvm/test/Bitcode/compatibility-3.8.ll +++ b/llvm/test/Bitcode/compatibility-3.8.ll @@ -742,7 +742,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 
x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-3.9.ll b/llvm/test/Bitcode/compatibility-3.9.ll index c34f04ceb0de39..c8309175e063f0 100644 --- a/llvm/test/Bitcode/compatibility-3.9.ll +++ b/llvm/test/Bitcode/compatibility-3.9.ll @@ -813,7 +813,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll index 05bffda1d117a3..adbd91ac6c7fe5 100644 --- a/llvm/test/Bitcode/compatibility-4.0.ll +++ b/llvm/test/Bitcode/compatibility-4.0.ll @@ -813,7 +813,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll index 0c872289c62ba8..1b500da69568af 100644 --- a/llvm/test/Bitcode/compatibility-5.0.ll +++ b/llvm/test/Bitcode/compatibility-5.0.ll @@ -820,7 +820,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll index 44c680885be34f..c1abbf0cda6eb9 100644 --- a/llvm/test/Bitcode/compatibility-6.0.ll +++ b/llvm/test/Bitcode/compatibility-6.0.ll @@ -830,7 +830,7 @@ define void @typesystem() { %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx + ; CHECK: %t7 = alloca <1 x i64> %t8 = alloca %opaquety* ; CHECK: %t8 = alloca ptr diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index c09eabde8ef87a..a849789da536ac 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1160,8 +1160,6 @@ define void @typesystem() { ; CHECK: %t5 = alloca x86_fp80 %t6 = alloca ppc_fp128 ; CHECK: %t6 = alloca ppc_fp128 - %t7 = alloca x86_mmx - ; CHECK: %t7 = alloca x86_mmx %t8 = alloca ptr ; CHECK: %t8 = alloca ptr %t9 = alloca <4 x i32> diff --git a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll index 69f733461efc77..ba40c5c4627d95 100644 --- a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll +++ b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll @@ -25,10 +25,8 @@ define void @test(<1 x i64> %c64, <1 x i64> %mask1, ptr %P) { ; CHECK-NEXT: popl %edi ; CHECK-NEXT: retl entry: - %tmp4 = bitcast <1 x i64> %mask1 to x86_mmx ; [#uses=1] - %tmp6 = bitcast <1 x i64> %c64 to x86_mmx ; [#uses=1] - tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp4, x86_mmx %tmp6, ptr %P ) + tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %mask1, <1 x i64> %c64, ptr %P ) ret void } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) diff --git a/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll b/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll index 79b06ba836af29..6c586782420e15 100644 --- a/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll +++ b/llvm/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < 
%s -mtriple=x86_64-apple-darwin -mattr=+mmx | FileCheck %s -@R = external global x86_mmx ; [#uses=1] +@R = external global <1 x i64> ; [#uses=1] define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind { ; CHECK-LABEL: foo: @@ -14,13 +14,11 @@ define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind { ; CHECK-NEXT: emms ; CHECK-NEXT: retq entry: - %tmp4 = bitcast <1 x i64> %B to x86_mmx ; <<4 x i16>> [#uses=1] - %tmp6 = bitcast <1 x i64> %A to x86_mmx ; <<4 x i16>> [#uses=1] - %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp6, x86_mmx %tmp4 ) ; [#uses=1] - store x86_mmx %tmp7, ptr @R + %tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w( <1 x i64> %A, <1 x i64> %B ) ; <<1 x i64>> [#uses=1] + store <1 x i64> %tmp7, ptr @R tail call void @llvm.x86.mmx.emms( ) ret void } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll index d439e827e81994..0c792644fc5c8a 100644 --- a/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll +++ b/llvm/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll @@ -5,15 +5,15 @@ entry: tail call void asm sideeffect "# top of block", "~{dirflag},~{fpsr},~{flags},~{di},~{si},~{dx},~{cx},~{ax}"( ) nounwind tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 8", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp1 = tail call x86_mmx asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; [#uses=1] + %tmp1 = tail call <1 x i64> asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <<1 x i64>> [#uses=1] tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 9", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef ) nounwind ; [#uses=1] + %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef ) nounwind ; [#uses=1] tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 10", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef, i32 undef, i32 %tmp3 ) nounwind + tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> undef, i32 undef, i32 %tmp3 ) nounwind tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind tail call void asm sideeffect ".line 11", "~{dirflag},~{fpsr},~{flags}"( ) nounwind - %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx %tmp1 ) nounwind ; [#uses=0] + %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( <1 x i64> %tmp1 ) nounwind ; [#uses=0] ret i32 undef } diff --git a/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll b/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll index 594edbaad29441..4a4477823a61d3 100644 --- a/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll +++ 
b/llvm/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll @@ -17,13 +17,13 @@ entry: br i1 false, label %bb.nph144.split, label %bb133 bb.nph144.split: ; preds = %entry - %tmp = bitcast <8 x i8> zeroinitializer to x86_mmx - %tmp2 = bitcast <8 x i8> zeroinitializer to x86_mmx - tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp, x86_mmx %tmp2, ptr null ) nounwind + %tmp = bitcast <8 x i8> zeroinitializer to <1 x i64> + %tmp2 = bitcast <8 x i8> zeroinitializer to <1 x i64> + tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp, <1 x i64> %tmp2, ptr null ) nounwind unreachable bb133: ; preds = %entry ret void } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind diff --git a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll index ac86279ca6667e..20673a177ac31f 100644 --- a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll +++ b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll @@ -26,25 +26,44 @@ entry: ; This is how to get MMX instructions. -define <2 x double> @a2(x86_mmx %x) nounwind { +define <2 x double> @a2(<1 x i64> %x) nounwind { ; CHECK-LABEL: a2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvtpi2pd %mm0, %xmm0 +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: movl 8(%ebp), %eax +; CHECK-NEXT: movl 12(%ebp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: cvtpi2pd (%esp), %xmm0 +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl entry: - %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x) + %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %x) ret <2 x double> %y } -define x86_mmx @b2(<2 x double> %x) nounwind { +define <1 x i64> @b2(<2 x double> %x) nounwind { ; CHECK-LABEL: b2: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $8, %esp ; CHECK-NEXT: cvttpd2pi %xmm0, %mm0 +; CHECK-NEXT: movq %mm0, (%esp) +; CHECK-NEXT: movl (%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl entry: - %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x) - ret x86_mmx %y + %y = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi (<2 x double> %x) + ret <1 x i64> %y } -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) -declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) +declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) diff --git a/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll b/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll index 306aeed1ace3e1..582ebb9bdcfd15 100644 --- a/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll +++ b/llvm/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll @@ -3,14 +3,14 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" target triple = "i386-apple-macosx10.6.6" -%0 = type { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx } +%0 = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } define i32 @pixman_fill_mmx(ptr nocapture %bits, i32 %stride, i32 %bpp, i32 %x, i32 %y, i32 %width, i32 %height, i32 %xor) nounwind ssp { entry: %conv = zext i32 %xor to i64 %shl = shl nuw i64 %conv, 32 %or = 
or i64 %shl, %conv - %0 = bitcast i64 %or to x86_mmx + %0 = bitcast i64 %or to <1 x i64> ; CHECK: movq [[MMXR:%mm[0-7],]] {{%mm[0-7]}} ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} @@ -18,7 +18,7 @@ entry: ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} ; CHECK-NEXT: movq [[MMXR]] {{%mm[0-7]}} - %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(x86_mmx %0) nounwind, !srcloc !0 + %1 = tail call %0 asm "movq\09\09$7,\09$0\0Amovq\09\09$7,\09$1\0Amovq\09\09$7,\09$2\0Amovq\09\09$7,\09$3\0Amovq\09\09$7,\09$4\0Amovq\09\09$7,\09$5\0Amovq\09\09$7,\09$6\0A", "=&y,=&y,=&y,=&y,=&y,=&y,=y,y,~{dirflag},~{fpsr},~{flags}"(<1 x i64> %0) nounwind, !srcloc !0 %asmresult = extractvalue %0 %1, 0 %asmresult6 = extractvalue %0 %1, 1 %asmresult7 = extractvalue %0 %1, 2 @@ -34,7 +34,7 @@ entry: ; CHECK-NEXT: movq {{%mm[0-7]}}, ; CHECK-NEXT: movq {{%mm[0-7]}}, ; CHECK-NEXT: movq {{%mm[0-7]}}, - tail call void asm sideeffect "movq\09$1,\09 ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, x86_mmx %0, x86_mmx %asmresult, x86_mmx %asmresult6, x86_mmx %asmresult7, x86_mmx %asmresult8, x86_mmx %asmresult9, x86_mmx %asmresult10, x86_mmx %asmresult11) nounwind, !srcloc !1 + tail call void asm sideeffect "movq\09$1,\09 ($0)\0Amovq\09$2,\09 8($0)\0Amovq\09$3,\0916($0)\0Amovq\09$4,\0924($0)\0Amovq\09$5,\0932($0)\0Amovq\09$6,\0940($0)\0Amovq\09$7,\0948($0)\0Amovq\09$8,\0956($0)\0A", "r,y,y,y,y,y,y,y,y,~{memory},~{dirflag},~{fpsr},~{flags}"(ptr undef, <1 x i64> %0, <1 x i64> %asmresult, <1 x i64> %asmresult6, <1 x i64> %asmresult7, <1 x i64> %asmresult8, <1 x i64> %asmresult9, <1 x i64> %asmresult10, <1 x i64> %asmresult11) nounwind, !srcloc !1 tail call void @llvm.x86.mmx.emms() nounwind ret i32 1 } diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index e41e9886a836b5..0bfd8921e8b42a 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -1011,23 +1011,19 @@ define float @broadcast_lifetime() nounwind { ret float %7 } -define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind { +define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind { ; X86-LABEL: broadcast_x86_mmx: ; X86: ## %bb.0: ## %bb -; X86-NEXT: subl $12, %esp -; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-LABEL: broadcast_x86_mmx: ; X64: ## %bb.0: ## %bb -; X64-NEXT: movdq2q %xmm0, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-NEXT: vmovq %rdi, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: retq bb: - %tmp1 = bitcast x86_mmx %tmp to i64 + %tmp1 = bitcast <1 x i64> %tmp to i64 %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0 %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16> %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll index 886c3057f82c57..c50af6968f5bb2 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -1449,31 +1449,24 
@@ eintry: ret void } -define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind { +define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind { ; X86-LABEL: broadcast_x86_mmx: ; X86: ## %bb.0: ## %bb -; X86-NEXT: subl $12, %esp -; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-AVX2-LABEL: broadcast_x86_mmx: ; X64-AVX2: ## %bb.0: ## %bb -; X64-AVX2-NEXT: movdq2q %xmm0, %mm0 -; X64-AVX2-NEXT: movq %mm0, %rax -; X64-AVX2-NEXT: vmovq %rax, %xmm0 +; X64-AVX2-NEXT: vmovq %rdi, %xmm0 ; X64-AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512VL-LABEL: broadcast_x86_mmx: ; X64-AVX512VL: ## %bb.0: ## %bb -; X64-AVX512VL-NEXT: movdq2q %xmm0, %mm0 -; X64-AVX512VL-NEXT: movq %mm0, %rax -; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm0 +; X64-AVX512VL-NEXT: vpbroadcastq %rdi, %xmm0 ; X64-AVX512VL-NEXT: retq bb: - %tmp1 = bitcast x86_mmx %tmp to i64 + %tmp1 = bitcast <1 x i64> %tmp to i64 %tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0 %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16> %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/bitcast-mmx.ll b/llvm/test/CodeGen/X86/bitcast-mmx.ll index 061723a0966e2b..fe48a96a51d3ec 100644 --- a/llvm/test/CodeGen/X86/bitcast-mmx.ll +++ b/llvm/test/CodeGen/X86/bitcast-mmx.ll @@ -17,9 +17,9 @@ define i32 @t0(i64 %x) nounwind { ; X64-NEXT: retq entry: %0 = bitcast i64 %x to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 -18) - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 -18) + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 %6 = bitcast i64 %5 to <2 x i32> @@ -52,9 +52,9 @@ define i64 @t1(i64 %x, i32 %n) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = bitcast i64 %x to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %n) - %2 = bitcast x86_mmx %1 to i64 + %0 = bitcast i64 %x to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %n) + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } @@ -88,11 +88,11 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind { entry: %0 = insertelement <2 x i32> undef, i32 %w, i32 0 %1 = insertelement <2 x i32> %0, i32 0, i32 1 - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %2, i32 %n) - %4 = bitcast i64 %x to x86_mmx - %5 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %4, x86_mmx %3) - %6 = bitcast x86_mmx %5 to i64 + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %2, i32 %n) + %4 = bitcast i64 %x to <1 x i64> + %5 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %4, <1 x i64> %3) + %6 = bitcast <1 x i64> %5 to i64 ret i64 %6 } @@ -123,14 +123,14 @@ define i64 @t3(ptr %y, ptr %n) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %y, align 8 + %0 = load <1 x i64>, ptr %y, align 8 %1 = load i32, ptr %n, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) -declare x86_mmx 
@llvm.x86.mmx.por(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir index 559560ac20f8af..aa637e7408f22a 100644 --- a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir +++ b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir @@ -6,9 +6,9 @@ define <2 x i32> @test_paddw(<2 x i32> %a) nounwind readnone { entry: - %0 = bitcast <2 x i32> %a to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %0, x86_mmx %0) - %2 = bitcast x86_mmx %1 to <2 x i32> + %0 = bitcast <2 x i32> %a to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %0, <1 x i64> %0) + %2 = bitcast <1 x i64> %1 to <2 x i32> ret <2 x i32> %2 } diff --git a/llvm/test/CodeGen/X86/fake-use-vector.ll b/llvm/test/CodeGen/X86/fake-use-vector.ll index be1dc123f8023e..1995b42f31ccee 100644 --- a/llvm/test/CodeGen/X86/fake-use-vector.ll +++ b/llvm/test/CodeGen/X86/fake-use-vector.ll @@ -1,6 +1,5 @@ ; assert in DAGlegalizer with fake use of 1-element vectors. ; RUN: llc -stop-after=finalize-isel -mtriple=x86_64-unknown-linux -filetype=asm -o - %s | FileCheck %s -; XFAIL: * ; ModuleID = 't2.cpp' ; source_filename = "t2.cpp" ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/CodeGen/X86/fast-isel-bc.ll b/llvm/test/CodeGen/X86/fast-isel-bc.ll index 0fbc9fab056814..64bdfd6d4f8632 100644 --- a/llvm/test/CodeGen/X86/fast-isel-bc.ll +++ b/llvm/test/CodeGen/X86/fast-isel-bc.ll @@ -4,7 +4,7 @@ ; PR4684 -declare void @func2(x86_mmx) +declare void @func2(<1 x i64>) ; This isn't spectacular, but it's MMX code at -O0... 
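The hunk that follows also shows the ABI side of the change: with the x86_mmx type gone, the <1 x i64> constant reaches @func2 in a general-purpose register on x86-64 (movabsq into %rdi) and through plain 32-bit stack stores on i386, instead of being materialized in %mm0. A minimal caller written directly against the new signature looks roughly like the sketch below; @use_mmx and @build_arg are illustrative names, not part of the test.

define void @build_arg(<2 x i32> %v) nounwind {
  ; An element-typed vector becomes the 64-bit MMX payload with one bitcast.
  %m = bitcast <2 x i32> %v to <1 x i64>
  call void @use_mmx(<1 x i64> %m)
  ret void
}

declare void @use_mmx(<1 x i64>)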
@@ -12,7 +12,11 @@ define void @func1() nounwind { ; X86-LABEL: func1: ; X86: ## %bb.0: ; X86-NEXT: subl $12, %esp -; X86-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 ## mm0 = 0x200000000 +; X86-NEXT: movl $2, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl %esp, %eax +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: calll _func2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: retl @@ -20,12 +24,11 @@ define void @func1() nounwind { ; X64-LABEL: func1: ; X64: ## %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0 ## mm0 = 0x200000000 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movabsq $8589934592, %rdi ## imm = 0x200000000 ; X64-NEXT: callq _func2 ; X64-NEXT: popq %rax ; X64-NEXT: retq - %tmp0 = bitcast <2 x i32> to x86_mmx - call void @func2(x86_mmx %tmp0) + %tmp0 = bitcast <2 x i32> to <1 x i64> + call void @func2(<1 x i64> %tmp0) ret void } diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll index c13fdae540d0b8..3b1a8f541b4902 100644 --- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -104,12 +104,12 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) { ; ALL-NEXT: movntq %mm0, (%rsi) ; ALL-NEXT: retq entry: - %0 = load x86_mmx, ptr %a0 - %1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3) - store x86_mmx %1, ptr %a1, align 8, !nontemporal !1 + %0 = load <1 x i64>, ptr %a0 + %1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3) + store <1 x i64> %1, ptr %a1, align 8, !nontemporal !1 ret void } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone ; ; 128-bit Vector Stores diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll index a4dbb10e0d7a04..439d7efc2d7551 100644 --- a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll +++ b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll @@ -10,30 +10,29 @@ define void @t3() nounwind { ; X86-64-LABEL: t3: ; X86-64: ## %bb.0: ; X86-64-NEXT: movq _g_v8qi@GOTPCREL(%rip), %rax -; X86-64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-64-NEXT: movb $1, %al +; X86-64-NEXT: movq (%rax), %rdi +; X86-64-NEXT: xorl %eax, %eax ; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL %tmp3 = load <8 x i8>, ptr @g_v8qi, align 8 - %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx - %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind + %tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64> + %tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind ret void } -define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind { +define void @t4(<1 x i64> %v1, <1 x i64> %v2) nounwind { ; X86-64-LABEL: t4: ; X86-64: ## %bb.0: -; X86-64-NEXT: movdq2q %xmm1, %mm0 -; X86-64-NEXT: movdq2q %xmm0, %mm1 -; X86-64-NEXT: movq2dq %mm1, %xmm1 -; X86-64-NEXT: movq2dq %mm0, %xmm0 -; X86-64-NEXT: paddb %xmm1, %xmm0 -; X86-64-NEXT: movb $1, %al +; X86-64-NEXT: movq %rdi, %xmm0 +; X86-64-NEXT: movq %rsi, %xmm1 +; X86-64-NEXT: paddb %xmm0, %xmm1 +; X86-64-NEXT: movq %xmm1, %rdi +; X86-64-NEXT: xorl %eax, %eax ; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL - %v1a = bitcast x86_mmx %v1 to <8 x i8> - %v2b = bitcast x86_mmx %v2 to <8 x i8> + %v1a = bitcast <1 x i64> %v1 to <8 x i8> + %v2b = bitcast <1 x i64> %v2 to <8 x i8> %tmp3 = add <8 x i8> %v1a, %v2b - %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx - %tmp4 = tail call i32 (...) 
@pass_v8qi( x86_mmx %tmp3a ) nounwind + %tmp3a = bitcast <8 x i8> %tmp3 to <1 x i64> + %tmp4 = tail call i32 (...) @pass_v8qi( <1 x i64> %tmp3a ) nounwind ret void } diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing.ll b/llvm/test/CodeGen/X86/mmx-arg-passing.ll index af116a2ac281b3..d933149c5e027e 100644 --- a/llvm/test/CodeGen/X86/mmx-arg-passing.ll +++ b/llvm/test/CodeGen/X86/mmx-arg-passing.ll @@ -8,26 +8,28 @@ ; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7]. ; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs. -@u1 = external global x86_mmx +@u1 = external global <1 x i64> -define void @t1(x86_mmx %v1) nounwind { +define void @t1(<1 x i64> %v1) nounwind { ; X86-32-LABEL: t1: ; X86-32: ## %bb.0: -; X86-32-NEXT: movl L_u1$non_lazy_ptr, %eax -; X86-32-NEXT: movq %mm0, (%eax) +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: movl L_u1$non_lazy_ptr, %edx +; X86-32-NEXT: movl %ecx, 4(%edx) +; X86-32-NEXT: movl %eax, (%edx) ; X86-32-NEXT: retl ; ; X86-64-LABEL: t1: ; X86-64: ## %bb.0: -; X86-64-NEXT: movdq2q %xmm0, %mm0 ; X86-64-NEXT: movq _u1@GOTPCREL(%rip), %rax -; X86-64-NEXT: movq %mm0, (%rax) +; X86-64-NEXT: movq %rdi, (%rax) ; X86-64-NEXT: retq - store x86_mmx %v1, ptr @u1, align 8 + store <1 x i64> %v1, ptr @u1, align 8 ret void } -@u2 = external global x86_mmx +@u2 = external global <1 x i64> define void @t2(<1 x i64> %v1) nounwind { ; X86-32-LABEL: t2: @@ -44,7 +46,6 @@ define void @t2(<1 x i64> %v1) nounwind { ; X86-64-NEXT: movq _u2@GOTPCREL(%rip), %rax ; X86-64-NEXT: movq %rdi, (%rax) ; X86-64-NEXT: retq - %tmp = bitcast <1 x i64> %v1 to x86_mmx - store x86_mmx %tmp, ptr @u2, align 8 + store <1 x i64> %v1, ptr @u2, align 8 ret void } diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index f9ef3dda78cfcf..73d459ba770264 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -18,8 +18,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X86-NEXT: paddsb (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: paddusb (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: psubb %xmm1, %xmm0 ; X86-NEXT: movdq2q %xmm0, %mm0 @@ -27,8 +27,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X86-NEXT: psubsb (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: psubusb (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -58,8 +58,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X64-NEXT: paddsb (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: paddusb (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: psubb %xmm1, %xmm0 ; X64-NEXT: movdq2q %xmm0, %mm0 @@ -67,8 +67,8 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X64-NEXT: psubsb (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: psubusb (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -88,53 +88,53 @@ define void @test0(ptr %A, ptr %B) nounwind { ; X64-NEXT: emms ; X64-NEXT: retq entry: - %tmp1 = load x86_mmx, ptr %A - %tmp3 = load x86_mmx, ptr %B - %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8> - %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8> + %tmp1 = load <1 x i64>, ptr %A + %tmp3 = load <1 x i64>, ptr %B + %tmp1a = bitcast <1 x i64> %tmp1 to <8 x i8> + %tmp3a = bitcast <1 x i64> %tmp3 to <8 x i8> %tmp4 = add <8 x i8> %tmp1a, %tmp3a - %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx - store x86_mmx %tmp4a, ptr %A - %tmp7 = load x86_mmx, ptr %B - %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %tmp4a, x86_mmx %tmp7) - store x86_mmx %tmp12, ptr %A - %tmp16 = load x86_mmx, ptr %B - %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %tmp12, x86_mmx %tmp16) - store x86_mmx %tmp21, ptr %A - %tmp27 = load x86_mmx, ptr %B - %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8> - %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8> + %tmp4a = bitcast <8 x i8> %tmp4 to <1 x i64> + store <1 x i64> %tmp4a, ptr %A + %tmp7 = load <1 x i64>, ptr %B + %tmp12 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %tmp4a, <1 x i64> %tmp7) + store <1 x i64> %tmp12, ptr %A + %tmp16 = load <1 x i64>, ptr %B + %tmp21 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %tmp12, <1 x i64> %tmp16) + store <1 x i64> %tmp21, ptr %A + %tmp27 = load <1 x i64>, ptr %B + %tmp21a = bitcast <1 x i64> %tmp21 to <8 x i8> + %tmp27a = bitcast <1 x i64> %tmp27 to <8 x i8> %tmp28 = sub <8 x i8> %tmp21a, %tmp27a - %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx - store x86_mmx %tmp28a, ptr %A - %tmp31 = load x86_mmx, ptr %B - %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %tmp28a, x86_mmx %tmp31) - store x86_mmx %tmp36, ptr %A - %tmp40 = load x86_mmx, ptr %B - %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %tmp36, x86_mmx %tmp40) - store x86_mmx %tmp45, ptr %A - %tmp51 = load x86_mmx, ptr %B - %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8> - %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8> + %tmp28a = bitcast <8 x i8> %tmp28 to <1 x i64> + store <1 x i64> %tmp28a, ptr %A + %tmp31 = load <1 x i64>, ptr %B + %tmp36 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %tmp28a, <1 x i64> %tmp31) + store <1 x i64> %tmp36, ptr %A + %tmp40 = load <1 x i64>, ptr %B + %tmp45 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %tmp36, <1 x i64> %tmp40) + store <1 x i64> %tmp45, ptr %A + %tmp51 = load <1 x i64>, ptr %B + %tmp45a = bitcast <1 x i64> %tmp45 to <8 x i8> + %tmp51a = bitcast <1 x i64> %tmp51 to <8 x i8> %tmp52 = mul <8 x i8> %tmp45a, %tmp51a - %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx - store x86_mmx %tmp52a, ptr %A - %tmp57 = load x86_mmx, ptr %B - %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8> + %tmp52a = bitcast <8 x i8> %tmp52 to <1 x i64> + store <1 x i64> %tmp52a, ptr %A + %tmp57 = load <1 x i64>, ptr %B + %tmp57a = bitcast <1 x i64> %tmp57 to <8 x i8> %tmp58 = and <8 x i8> %tmp52, %tmp57a - %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx - store x86_mmx %tmp58a, ptr %A - %tmp63 = load x86_mmx, ptr %B - %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8> + %tmp58a = bitcast <8 x i8> %tmp58 to <1 x i64> + store <1 x i64> %tmp58a, ptr %A + %tmp63 = load <1 x i64>, ptr %B + %tmp63a = bitcast <1 x i64> %tmp63 to <8 x i8> %tmp64 = or <8 x i8> %tmp58, %tmp63a - %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx - store x86_mmx %tmp64a, ptr %A - %tmp69 = load x86_mmx, ptr %B - %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8> - %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8> + %tmp64a 
= bitcast <8 x i8> %tmp64 to <1 x i64> + store <1 x i64> %tmp64a, ptr %A + %tmp69 = load <1 x i64>, ptr %B + %tmp69a = bitcast <1 x i64> %tmp69 to <8 x i8> + %tmp64b = bitcast <1 x i64> %tmp64a to <8 x i8> %tmp70 = xor <8 x i8> %tmp64b, %tmp69a - %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx - store x86_mmx %tmp70a, ptr %A + %tmp70a = bitcast <8 x i8> %tmp70 to <1 x i64> + store <1 x i64> %tmp70a, ptr %A tail call void @llvm.x86.mmx.emms() ret void } @@ -196,42 +196,42 @@ define void @test1(ptr %A, ptr %B) nounwind { ; X64-NEXT: emms ; X64-NEXT: retq entry: - %tmp1 = load x86_mmx, ptr %A - %tmp3 = load x86_mmx, ptr %B - %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32> - %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32> + %tmp1 = load <1 x i64>, ptr %A + %tmp3 = load <1 x i64>, ptr %B + %tmp1a = bitcast <1 x i64> %tmp1 to <2 x i32> + %tmp3a = bitcast <1 x i64> %tmp3 to <2 x i32> %tmp4 = add <2 x i32> %tmp1a, %tmp3a - %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx - store x86_mmx %tmp4a, ptr %A - %tmp9 = load x86_mmx, ptr %B - %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32> + %tmp4a = bitcast <2 x i32> %tmp4 to <1 x i64> + store <1 x i64> %tmp4a, ptr %A + %tmp9 = load <1 x i64>, ptr %B + %tmp9a = bitcast <1 x i64> %tmp9 to <2 x i32> %tmp10 = sub <2 x i32> %tmp4, %tmp9a - %tmp10a = bitcast <2 x i32> %tmp4 to x86_mmx - store x86_mmx %tmp10a, ptr %A - %tmp15 = load x86_mmx, ptr %B - %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32> - %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32> + %tmp10a = bitcast <2 x i32> %tmp4 to <1 x i64> + store <1 x i64> %tmp10a, ptr %A + %tmp15 = load <1 x i64>, ptr %B + %tmp10b = bitcast <1 x i64> %tmp10a to <2 x i32> + %tmp15a = bitcast <1 x i64> %tmp15 to <2 x i32> %tmp16 = mul <2 x i32> %tmp10b, %tmp15a - %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx - store x86_mmx %tmp16a, ptr %A - %tmp21 = load x86_mmx, ptr %B - %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32> - %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32> + %tmp16a = bitcast <2 x i32> %tmp16 to <1 x i64> + store <1 x i64> %tmp16a, ptr %A + %tmp21 = load <1 x i64>, ptr %B + %tmp16b = bitcast <1 x i64> %tmp16a to <2 x i32> + %tmp21a = bitcast <1 x i64> %tmp21 to <2 x i32> %tmp22 = and <2 x i32> %tmp16b, %tmp21a - %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx - store x86_mmx %tmp22a, ptr %A - %tmp27 = load x86_mmx, ptr %B - %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32> - %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32> + %tmp22a = bitcast <2 x i32> %tmp22 to <1 x i64> + store <1 x i64> %tmp22a, ptr %A + %tmp27 = load <1 x i64>, ptr %B + %tmp22b = bitcast <1 x i64> %tmp22a to <2 x i32> + %tmp27a = bitcast <1 x i64> %tmp27 to <2 x i32> %tmp28 = or <2 x i32> %tmp22b, %tmp27a - %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx - store x86_mmx %tmp28a, ptr %A - %tmp33 = load x86_mmx, ptr %B - %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32> - %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32> + %tmp28a = bitcast <2 x i32> %tmp28 to <1 x i64> + store <1 x i64> %tmp28a, ptr %A + %tmp33 = load <1 x i64>, ptr %B + %tmp28b = bitcast <1 x i64> %tmp28a to <2 x i32> + %tmp33a = bitcast <1 x i64> %tmp33 to <2 x i32> %tmp34 = xor <2 x i32> %tmp28b, %tmp33a - %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx - store x86_mmx %tmp34a, ptr %A + %tmp34a = bitcast <2 x i32> %tmp34 to <1 x i64> + store <1 x i64> %tmp34a, ptr %A tail call void @llvm.x86.mmx.emms( ) ret void } @@ -239,8 +239,13 @@ entry: define void @test2(ptr %A, ptr %B) nounwind { ; X86-LABEL: test2: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: paddw %xmm0, %xmm1 @@ -249,8 +254,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: paddsw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: paddusw (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: psubw %xmm1, %xmm0 ; X86-NEXT: movdq2q %xmm0, %mm0 @@ -258,8 +263,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: psubsw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: psubusw (%ecx), %mm0 -; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X86-NEXT: pmullw %xmm0, %xmm1 ; X86-NEXT: movdq2q %xmm1, %mm0 @@ -267,18 +272,26 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X86-NEXT: pmulhw (%ecx), %mm0 ; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: pmaddwd (%ecx), %mm0 +; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: movl (%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movq %mm0, (%eax) -; X86-NEXT: movq2dq %mm0, %xmm0 -; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: andps %xmm0, %xmm1 -; X86-NEXT: movlps %xmm1, (%eax) -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: orps %xmm1, %xmm0 -; X86-NEXT: movlps %xmm0, (%eax) -; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: xorps %xmm0, %xmm1 -; X86-NEXT: movlps %xmm1, (%eax) +; X86-NEXT: andl 4(%ecx), %esi +; X86-NEXT: movd %esi, %xmm0 +; X86-NEXT: andl (%ecx), %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: movq %xmm1, (%eax) +; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: por %xmm1, %xmm0 +; X86-NEXT: movq %xmm0, (%eax) +; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: pxor %xmm0, %xmm1 +; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: emms +; X86-NEXT: leal -4(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: test2: @@ -291,8 +304,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: paddsw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: paddusw (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: psubw %xmm1, %xmm0 ; X64-NEXT: movdq2q %xmm0, %mm0 @@ -300,8 +313,8 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: psubsw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: psubusw (%rsi), %mm0 -; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: pmullw %xmm0, %xmm1 ; X64-NEXT: movdq2q %xmm1, %mm0 @@ -309,76 +322,75 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: pmulhw (%rsi), %mm0 ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: pmaddwd (%rsi), %mm0 +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: movq %mm0, (%rdi) -; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: andps %xmm0, %xmm1 -; X64-NEXT: movlps %xmm1, (%rdi) -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: orps %xmm1, %xmm0 -; X64-NEXT: movlps %xmm0, (%rdi) -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: 
xorps %xmm0, %xmm1 -; X64-NEXT: movlps %xmm1, (%rdi) +; X64-NEXT: andq (%rsi), %rax +; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, (%rdi) +; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq entry: - %tmp1 = load x86_mmx, ptr %A - %tmp3 = load x86_mmx, ptr %B - %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16> - %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16> + %tmp1 = load <1 x i64>, ptr %A + %tmp3 = load <1 x i64>, ptr %B + %tmp1a = bitcast <1 x i64> %tmp1 to <4 x i16> + %tmp3a = bitcast <1 x i64> %tmp3 to <4 x i16> %tmp4 = add <4 x i16> %tmp1a, %tmp3a - %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx - store x86_mmx %tmp4a, ptr %A - %tmp7 = load x86_mmx, ptr %B - %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %tmp4a, x86_mmx %tmp7) - store x86_mmx %tmp12, ptr %A - %tmp16 = load x86_mmx, ptr %B - %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp12, x86_mmx %tmp16) - store x86_mmx %tmp21, ptr %A - %tmp27 = load x86_mmx, ptr %B - %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16> - %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16> + %tmp4a = bitcast <4 x i16> %tmp4 to <1 x i64> + store <1 x i64> %tmp4a, ptr %A + %tmp7 = load <1 x i64>, ptr %B + %tmp12 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %tmp4a, <1 x i64> %tmp7) + store <1 x i64> %tmp12, ptr %A + %tmp16 = load <1 x i64>, ptr %B + %tmp21 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %tmp12, <1 x i64> %tmp16) + store <1 x i64> %tmp21, ptr %A + %tmp27 = load <1 x i64>, ptr %B + %tmp21a = bitcast <1 x i64> %tmp21 to <4 x i16> + %tmp27a = bitcast <1 x i64> %tmp27 to <4 x i16> %tmp28 = sub <4 x i16> %tmp21a, %tmp27a - %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx - store x86_mmx %tmp28a, ptr %A - %tmp31 = load x86_mmx, ptr %B - %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %tmp28a, x86_mmx %tmp31) - store x86_mmx %tmp36, ptr %A - %tmp40 = load x86_mmx, ptr %B - %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %tmp36, x86_mmx %tmp40) - store x86_mmx %tmp45, ptr %A - %tmp51 = load x86_mmx, ptr %B - %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16> - %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16> + %tmp28a = bitcast <4 x i16> %tmp28 to <1 x i64> + store <1 x i64> %tmp28a, ptr %A + %tmp31 = load <1 x i64>, ptr %B + %tmp36 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %tmp28a, <1 x i64> %tmp31) + store <1 x i64> %tmp36, ptr %A + %tmp40 = load <1 x i64>, ptr %B + %tmp45 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %tmp36, <1 x i64> %tmp40) + store <1 x i64> %tmp45, ptr %A + %tmp51 = load <1 x i64>, ptr %B + %tmp45a = bitcast <1 x i64> %tmp45 to <4 x i16> + %tmp51a = bitcast <1 x i64> %tmp51 to <4 x i16> %tmp52 = mul <4 x i16> %tmp45a, %tmp51a - %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx - store x86_mmx %tmp52a, ptr %A - %tmp55 = load x86_mmx, ptr %B - %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %tmp52a, x86_mmx %tmp55) - store x86_mmx %tmp60, ptr %A - %tmp64 = load x86_mmx, ptr %B - %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %tmp60, x86_mmx %tmp64) - %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx - store x86_mmx %tmp70, ptr %A - %tmp75 = load x86_mmx, ptr %B - %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16> - %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16> + %tmp52a = bitcast <4 x i16> %tmp52 to <1 x i64> + store <1 x i64> %tmp52a, ptr %A + %tmp55 = load <1 x i64>, ptr 
%B + %tmp60 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %tmp52a, <1 x i64> %tmp55) + store <1 x i64> %tmp60, ptr %A + %tmp64 = load <1 x i64>, ptr %B + %tmp69 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %tmp60, <1 x i64> %tmp64) + store <1 x i64> %tmp69, ptr %A + %tmp75 = load <1 x i64>, ptr %B + %tmp70a = bitcast <1 x i64> %tmp69 to <4 x i16> + %tmp75a = bitcast <1 x i64> %tmp75 to <4 x i16> %tmp76 = and <4 x i16> %tmp70a, %tmp75a - %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx - store x86_mmx %tmp76a, ptr %A - %tmp81 = load x86_mmx, ptr %B - %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16> - %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16> + %tmp76a = bitcast <4 x i16> %tmp76 to <1 x i64> + store <1 x i64> %tmp76a, ptr %A + %tmp81 = load <1 x i64>, ptr %B + %tmp76b = bitcast <1 x i64> %tmp76a to <4 x i16> + %tmp81a = bitcast <1 x i64> %tmp81 to <4 x i16> %tmp82 = or <4 x i16> %tmp76b, %tmp81a - %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx - store x86_mmx %tmp82a, ptr %A - %tmp87 = load x86_mmx, ptr %B - %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16> - %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16> + %tmp82a = bitcast <4 x i16> %tmp82 to <1 x i64> + store <1 x i64> %tmp82a, ptr %A + %tmp87 = load <1 x i64>, ptr %B + %tmp82b = bitcast <1 x i64> %tmp82a to <4 x i16> + %tmp87a = bitcast <1 x i64> %tmp87 to <4 x i16> %tmp88 = xor <4 x i16> %tmp82b, %tmp87a - %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx - store x86_mmx %tmp88a, ptr %A + %tmp88a = bitcast <4 x i16> %tmp88 to <1 x i64> + store <1 x i64> %tmp88a, ptr %A tail call void @llvm.x86.mmx.emms( ) ret void } @@ -574,10 +586,10 @@ define void @ti8a(double %a, double %b) nounwind { ; X64-NEXT: movq %mm1, 0 ; X64-NEXT: retq entry: - %tmp1 = bitcast double %a to x86_mmx - %tmp2 = bitcast double %b to x86_mmx - %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2) - store x86_mmx %tmp3, ptr null + %tmp1 = bitcast double %a to <1 x i64> + %tmp2 = bitcast double %b to <1 x i64> + %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %tmp1, <1 x i64> %tmp2) + store <1 x i64> %tmp3, ptr null ret void } @@ -597,10 +609,10 @@ define void @ti16a(double %a, double %b) nounwind { ; X64-NEXT: movq %mm1, 0 ; X64-NEXT: retq entry: - %tmp1 = bitcast double %a to x86_mmx - %tmp2 = bitcast double %b to x86_mmx - %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2) - store x86_mmx %tmp3, ptr null + %tmp1 = bitcast double %a to <1 x i64> + %tmp2 = bitcast double %b to <1 x i64> + %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %tmp1, <1 x i64> %tmp2) + store <1 x i64> %tmp3, ptr null ret void } @@ -620,10 +632,10 @@ define void @ti32a(double %a, double %b) nounwind { ; X64-NEXT: movq %mm1, 0 ; X64-NEXT: retq entry: - %tmp1 = bitcast double %a to x86_mmx - %tmp2 = bitcast double %b to x86_mmx - %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2) - store x86_mmx %tmp3, ptr null + %tmp1 = bitcast double %a to <1 x i64> + %tmp2 = bitcast double %b to <1 x i64> + %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %tmp1, <1 x i64> %tmp2) + store <1 x i64> %tmp3, ptr null ret void } @@ -643,10 +655,10 @@ define void @ti64a(double %a, double %b) nounwind { ; X64-NEXT: movq %mm1, 0 ; X64-NEXT: retq entry: - %tmp1 = bitcast double %a to x86_mmx - %tmp2 = bitcast double %b to x86_mmx - %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2) - store x86_mmx %tmp3, ptr null + %tmp1 = bitcast double %a to <1 x i64> + %tmp2 = bitcast double 
%b to <1 x i64> + %tmp3 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %tmp1, <1 x i64> %tmp2) + store <1 x i64> %tmp3, ptr null ret void } @@ -674,28 +686,28 @@ define i64 @pr43922() nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx bitcast (<2 x i32> to x86_mmx), i32 268435456) - %1 = bitcast x86_mmx %0 to i64 + %0 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> bitcast (<2 x i32> to <1 x i64>), i32 268435456) + %1 = bitcast <1 x i64> %0 to i64 ret i64 %1 } -declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll index 0fa7b24ff445aa..fb2517f5a891be 100644 --- a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll +++ b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll @@ -1,12 +1,12 @@ ; RUN: opt -mtriple=x86_64-- -passes=early-cse -earlycse-debug-hash < %s -S | FileCheck %s -; CHECK: @foo(x86_mmx bitcast (double 0.000000e+00 to x86_mmx)) +; CHECK: @foo(<1 x i64> zeroinitializer) define void @bar() { entry: - %0 = bitcast double 0.0 to x86_mmx - %1 = call x86_mmx @foo(x86_mmx %0) + %0 = bitcast double 0.0 to <1 x i64> + %1 = call <1 x i64> @foo(<1 x i64> %0) ret void } -declare x86_mmx @foo(x86_mmx) +declare <1 x i64> @foo(<1 x i64>) diff --git a/llvm/test/CodeGen/X86/mmx-bitcast.ll b/llvm/test/CodeGen/X86/mmx-bitcast.ll index f914b8622fcf4b..5e5be820dd5b42 100644 --- a/llvm/test/CodeGen/X86/mmx-bitcast.ll +++ b/llvm/test/CodeGen/X86/mmx-bitcast.ll @@ -8,9 +8,9 @@ define i64 @t0(ptr %p) { ; CHECK-NEXT: paddq %mm0, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %t = load x86_mmx, ptr %p - %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t) - %s = bitcast x86_mmx %u to i64 
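; (Illustrative sketch, not part of the test: the t0-t3 bodies below now operate
;  on the loaded <1 x i64> directly and recover the scalar result with a single
;  bitcast, e.g.
;    %t = load <1 x i64>, ptr %p
;    %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %t)
;    %s = bitcast <1 x i64> %u to i64
;  so no intermediate x86_mmx casts are needed.)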
+ %t = load <1 x i64>, ptr %p + %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %t) + %s = bitcast <1 x i64> %u to i64 ret i64 %s } @@ -21,9 +21,9 @@ define i64 @t1(ptr %p) { ; CHECK-NEXT: paddd %mm0, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %t = load x86_mmx, ptr %p - %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t) - %s = bitcast x86_mmx %u to i64 + %t = load <1 x i64>, ptr %p + %u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %t) + %s = bitcast <1 x i64> %u to i64 ret i64 %s } @@ -34,9 +34,9 @@ define i64 @t2(ptr %p) { ; CHECK-NEXT: paddw %mm0, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %t = load x86_mmx, ptr %p - %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t) - %s = bitcast x86_mmx %u to i64 + %t = load <1 x i64>, ptr %p + %u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %t) + %s = bitcast <1 x i64> %u to i64 ret i64 %s } @@ -47,29 +47,27 @@ define i64 @t3(ptr %p) { ; CHECK-NEXT: paddb %mm0, %mm0 ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %t = load x86_mmx, ptr %p - %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t) - %s = bitcast x86_mmx %u to i64 + %t = load <1 x i64>, ptr %p + %u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %t) + %s = bitcast <1 x i64> %u to i64 ret i64 %s } -@R = external global x86_mmx +@R = external global <1 x i64> define void @t4(<1 x i64> %A, <1 x i64> %B) { ; CHECK-LABEL: t4: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: movq %rdi, %mm0 -; CHECK-NEXT: movq %rsi, %mm1 +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 ; CHECK-NEXT: paddusw %mm0, %mm1 ; CHECK-NEXT: movq _R@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq %mm1, (%rax) ; CHECK-NEXT: emms ; CHECK-NEXT: retq entry: - %tmp2 = bitcast <1 x i64> %A to x86_mmx - %tmp3 = bitcast <1 x i64> %B to x86_mmx - %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %tmp2, x86_mmx %tmp3) - store x86_mmx %tmp7, ptr @R + %tmp7 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %A, <1 x i64> %B) + store <1 x i64> %tmp7, ptr @R tail call void @llvm.x86.mmx.emms() ret void } @@ -88,7 +86,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone { ret i64 %conv } -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) define <1 x i64> @t6(i64 %t) { ; CHECK-LABEL: t6: @@ -98,16 +96,14 @@ define <1 x i64> @t6(i64 %t) { ; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %t1 = insertelement <1 x i64> undef, i64 %t, i32 0 - %t0 = bitcast <1 x i64> %t1 to x86_mmx - %t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48) - %t3 = bitcast x86_mmx %t2 to <1 x i64> - ret <1 x i64> %t3 + %t2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %t1, i32 48) + ret <1 x i64> %t2 } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll 
b/llvm/test/CodeGen/X86/mmx-build-vector.ll index b919c9a33ea2f9..d8a010bacc683d 100644 --- a/llvm/test/CodeGen/X86/mmx-build-vector.ll +++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll @@ -8,7 +8,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f | FileCheck %s --check-prefix=X64 -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) ; ; v2i32 @@ -35,9 +35,9 @@ define void @build_v2i32_01(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 %a0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -58,9 +58,9 @@ define void @build_v2i32_0z(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 %a0, i32 0 %2 = insertelement <2 x i32> %1, i32 0, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -92,9 +92,9 @@ define void @build_v2i32_u1(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 undef, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -119,9 +119,9 @@ define void @build_v2i32_z1(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -153,9 +153,9 @@ define void @build_v2i32_00(ptr%p0, i32 %a0, i32 %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 %a0, i32 0 %2 = insertelement <2 x i32> %1, i32 %a0, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -194,9 +194,9 @@ define void @build_v4i16_0123(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 %a2, i32 2 %4 = insertelement <4 x i16> %3, i16 %a3, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -229,9 +229,9 @@ 
define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 0, i32 2 %4 = insertelement <4 x i16> %3, i16 0, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -254,9 +254,9 @@ define void @build_v4i16_0uuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 undef, i32 1 %3 = insertelement <4 x i16> %2, i16 undef, i32 2 %4 = insertelement <4 x i16> %3, i16 0, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -281,9 +281,9 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 0, i32 1 %3 = insertelement <4 x i16> %2, i16 undef, i32 2 %4 = insertelement <4 x i16> %3, i16 0, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -316,9 +316,9 @@ define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 %3 = insertelement <4 x i16> %2, i16 %a2, i32 2 %4 = insertelement <4 x i16> %3, i16 undef, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -353,9 +353,9 @@ define void @build_v4i16_0u00(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi %2 = insertelement <4 x i16> %1, i16 undef, i32 1 %3 = insertelement <4 x i16> %2, i16 %a0, i32 2 %4 = insertelement <4 x i16> %3, i16 %a0, i32 3 - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - store x86_mmx %6, ptr%p0 + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + store <1 x i64> %6, ptr%p0 ret void } @@ -414,9 +414,9 @@ define void @build_v8i8_01234567(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 %a5, i32 5 %7 = insertelement <8 x i8> %6, i8 %a6, i32 6 %8 = insertelement <8 x i8> %7, i8 %a7, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -469,9 +469,9 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 %a5, i32 5 %7 = insertelement <8 x i8> %6, i8 0, i32 6 %8 = insertelement <8 x i8> %7, i8 %a7, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - 
store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -522,9 +522,9 @@ define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 0, i32 5 %7 = insertelement <8 x i8> %6, i8 0, i32 6 %8 = insertelement <8 x i8> %7, i8 undef, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -551,9 +551,9 @@ define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 0, i32 5 %7 = insertelement <8 x i8> %6, i8 0, i32 6 %8 = insertelement <8 x i8> %7, i8 0, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -582,9 +582,9 @@ define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 0, i32 5 %7 = insertelement <8 x i8> %6, i8 0, i32 6 %8 = insertelement <8 x i8> %7, i8 undef, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -626,9 +626,9 @@ define void @build_v8i8_00000000(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, %6 = insertelement <8 x i8> %5, i8 %a0, i32 5 %7 = insertelement <8 x i8> %6, i8 %a0, i32 6 %8 = insertelement <8 x i8> %7, i8 %a0, i32 7 - %9 = bitcast <8 x i8> %8 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %9, x86_mmx %9) - store x86_mmx %10, ptr%p0 + %9 = bitcast <8 x i8> %8 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %9, <1 x i64> %9) + store <1 x i64> %10, ptr%p0 ret void } @@ -669,9 +669,9 @@ define void @build_v2f32_01(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float %a0, i32 0 %2 = insertelement <2 x float> %1, float %a1, i32 1 - %3 = bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -707,9 +707,9 @@ define void @build_v2f32_0z(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float %a0, i32 0 %2 = insertelement <2 x float> %1, float 0.0, i32 1 - %3 = bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -742,9 +742,9 @@ define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float undef, i32 0 %2 = insertelement <2 x float> %1, float %a1, i32 1 - %3 = 
bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -780,9 +780,9 @@ define void @build_v2f32_z1(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float 0.0, i32 0 %2 = insertelement <2 x float> %1, float %a1, i32 1 - %3 = bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } @@ -815,8 +815,8 @@ define void @build_v2f32_00(ptr%p0, float %a0, float %a1) nounwind { ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float %a0, i32 0 %2 = insertelement <2 x float> %1, float %a0, i32 1 - %3 = bitcast <2 x float> %2 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3) - store x86_mmx %4, ptr%p0 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %3, <1 x i64> %3) + store <1 x i64> %4, ptr%p0 ret void } diff --git a/llvm/test/CodeGen/X86/mmx-coalescing.ll b/llvm/test/CodeGen/X86/mmx-coalescing.ll index dac526fe20bbf0..589f5af4bb4d64 100644 --- a/llvm/test/CodeGen/X86/mmx-coalescing.ll +++ b/llvm/test/CodeGen/X86/mmx-coalescing.ll @@ -42,9 +42,9 @@ entry: %SA2 = getelementptr inbounds %SA, ptr %pSA, i64 0, i32 4 %v3 = load ptr, ptr %SA2, align 8 %v4 = bitcast <1 x i64> %v0 to <4 x i16> - %v5 = bitcast <4 x i16> %v4 to x86_mmx - %v6 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v5, i8 -18) - %v7 = bitcast x86_mmx %v6 to <4 x i16> + %v5 = bitcast <4 x i16> %v4 to <1 x i64> + %v6 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v5, i8 -18) + %v7 = bitcast <1 x i64> %v6 to <4 x i16> %v8 = bitcast <4 x i16> %v7 to <1 x i64> %v9 = extractelement <1 x i64> %v8, i32 0 %v10 = bitcast i64 %v9 to <2 x i32> @@ -55,18 +55,18 @@ entry: if.A: %pa = phi <1 x i64> [ %v8, %entry ], [ %vx, %if.C ] %v17 = extractelement <1 x i64> %pa, i32 0 - %v18 = bitcast i64 %v17 to x86_mmx - %v19 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %v18, i32 %B) #2 - %v20 = bitcast x86_mmx %v19 to i64 + %v18 = bitcast i64 %v17 to <1 x i64> + %v19 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %v18, i32 %B) #2 + %v20 = bitcast <1 x i64> %v19 to i64 %v21 = insertelement <1 x i64> undef, i64 %v20, i32 0 %cmp3 = icmp eq i64 %v20, 0 br i1 %cmp3, label %if.C, label %merge if.B: %v34 = bitcast <1 x i64> %v8 to <4 x i16> - %v35 = bitcast <4 x i16> %v34 to x86_mmx - %v36 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v35, i8 -18) - %v37 = bitcast x86_mmx %v36 to <4 x i16> + %v35 = bitcast <4 x i16> %v34 to <1 x i64> + %v36 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v35, i8 -18) + %v37 = bitcast <1 x i64> %v36 to <4 x i16> %v38 = bitcast <4 x i16> %v37 to <1 x i64> br label %if.C @@ -80,9 +80,9 @@ if.C: merge: %vy = phi <1 x i64> [ %v21, %if.A ], [ %vx, %if.C ] %v130 = bitcast <1 x i64> %vy to <4 x i16> - %v131 = bitcast <4 x i16> %v130 to x86_mmx - %v132 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v131, i8 -18) - %v133 = bitcast x86_mmx %v132 to <4 x i16> + %v131 = bitcast <4 x i16> %v130 to <1 x i64> + %v132 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v131, i8 -18) + %v133 = bitcast <1 x i64> 
%v132 to <4 x i16> %v134 = bitcast <4 x i16> %v133 to <1 x i64> %v135 = extractelement <1 x i64> %v134, i32 0 %v136 = bitcast i64 %v135 to <2 x i32> @@ -91,5 +91,5 @@ merge: } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll index c09c417c11c966..51a71dab37f6da 100644 --- a/llvm/test/CodeGen/X86/mmx-cvt.ll +++ b/llvm/test/CodeGen/X86/mmx-cvt.ll @@ -8,20 +8,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: cvt_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvtpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvt_v2f64_v2i32: @@ -33,9 +23,9 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind { %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0) %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ -44,20 +34,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: cvtt_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvtt_v2f64_v2i32: @@ -69,9 +49,9 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind { %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0) %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ -80,20 +60,10 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X86-LABEL: fptosi_v2f64_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-NEXT: cvttpd2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v2f64_v2i32: @@ -103,9 +73,9 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind { ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: retq %3 = fptosi <2 x double> %0 to <2 x i32> - %4 = bitcast <2 x i32> %3 to x86_mmx - %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4) - %6 = bitcast x86_mmx %5 to i64 + %4 = bitcast <2 x i32> %3 to <1 x i64> + %5 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %4, <1 x i64> %4) + %6 = bitcast <1 x i64> %5 to i64 %7 = insertelement <1 x i64> undef, i64 %6, i32 0 store <1 x i64> %7, ptr %1 ret void @@ -114,20 +84,10 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind { define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: cvt_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvtps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvt_v2f32_v2i32: @@ -139,9 +99,9 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind { %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0) %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ -150,20 +110,10 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind { define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: cvtt_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: cvtt_v2f32_v2i32: @@ -175,9 +125,9 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind { %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0) %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ 
-186,20 +136,10 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind { define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind { ; X86-LABEL: fptosi_v4f32_v4i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v4f32_v4i32: @@ -210,9 +150,9 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind { ; X64-NEXT: retq %3 = fptosi <4 x float> %0 to <4 x i32> %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> - %5 = bitcast <2 x i32> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5) - %7 = bitcast x86_mmx %6 to i64 + %5 = bitcast <2 x i32> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %5) + %7 = bitcast <1 x i64> %6 to i64 %8 = insertelement <1 x i64> undef, i64 %7, i32 0 store <1 x i64> %8, ptr %1 ret void @@ -221,20 +161,10 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind { define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind { ; X86-LABEL: fptosi_v2f32_v2i32: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cvttps2pi %xmm0, %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq %mm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: fptosi_v2f32_v2i32: @@ -246,9 +176,9 @@ define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind { %3 = fptosi <4 x float> %0 to <4 x i32> %4 = bitcast <4 x i32> %3 to <2 x i64> %5 = extractelement <2 x i64> %4, i32 0 - %6 = bitcast i64 %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6) - %8 = bitcast x86_mmx %7 to i64 + %6 = bitcast i64 %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %6, <1 x i64> %6) + %8 = bitcast <1 x i64> %7 to i64 %9 = insertelement <1 x i64> undef, i64 %8, i32 0 store <1 x i64> %9, ptr %1 ret void @@ -280,9 +210,9 @@ define <2 x double> @sitofp_v2i32_v2f64(ptr) nounwind { ; X64-NEXT: movq2dq %mm0, %xmm0 ; X64-NEXT: cvtdq2pd %xmm0, %xmm0 ; X64-NEXT: retq - %2 = load x86_mmx, ptr %0, align 8 - %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2) - %4 = bitcast x86_mmx %3 to i64 + %2 = load <1 x i64>, ptr %0, align 8 + %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2) + %4 = bitcast <1 x i64> %3 to i64 %5 = insertelement <2 x i64> undef, i64 %4, i32 0 %6 = bitcast <2 x i64> %5 to <4 x i32> %7 = shufflevector <4 x i32> %6, <4 x i32> undef, <2 x i32> @@ -307,9 +237,9 @@ define <4 x float> @sitofp_v2i32_v2f32(ptr) nounwind { ; X64-NEXT: movq2dq %mm0, %xmm0 ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq - %2 = load x86_mmx, ptr %0, align 8 - %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2) - %4 = bitcast x86_mmx %3 to <2 x i32> + %2 = load <1 x i64>, ptr %0, 
align 8 + %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2) + %4 = bitcast <1 x i64> %3 to <2 x i32> %5 = shufflevector <2 x i32> %4, <2 x i32> zeroinitializer, <4 x i32> %6 = sitofp <4 x i32> %5 to <4 x float> ret <4 x float> %6 @@ -339,9 +269,9 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind { ; X64-NEXT: movq2dq %mm0, %xmm0 ; X64-NEXT: cvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq - %2 = load x86_mmx, ptr %0, align 8 - %3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2) - %4 = bitcast x86_mmx %3 to i64 + %2 = load <1 x i64>, ptr %0, align 8 + %3 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %2, <1 x i64> %2) + %4 = bitcast <1 x i64> %3 to i64 %5 = insertelement <2 x i64> undef, i64 %4, i32 0 %6 = insertelement <2 x i64> %5, i64 0, i32 1 %7 = bitcast <2 x i64> %6 to <4 x i32> @@ -349,7 +279,7 @@ define <4 x float> @cvt_v2i32_v2f32(ptr) nounwind { ret <4 x float> %8 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll index 73df6be8d79890..6fe3bc4973185f 100644 --- a/llvm/test/CodeGen/X86/mmx-fold-load.ll +++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll @@ -29,13 +29,13 @@ define i64 @t0(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) define i64 @t1(ptr %a, ptr %b) nounwind { ; X86-LABEL: t1: @@ -64,13 +64,13 @@ define i64 @t1(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) define i64 @t2(ptr %a, ptr %b) nounwind { ; X86-LABEL: t2: @@ -99,13 +99,13 @@ define i64 @t2(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) define i64 @t3(ptr %a, ptr %b) nounwind { ; X86-LABEL: t3: @@ -134,13 +134,13 @@ define i64 @t3(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %0, i32 
%1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) define i64 @t4(ptr %a, ptr %b) nounwind { ; X86-LABEL: t4: @@ -169,13 +169,13 @@ define i64 @t4(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) define i64 @t5(ptr %a, ptr %b) nounwind { ; X86-LABEL: t5: @@ -204,13 +204,13 @@ define i64 @t5(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) define i64 @t6(ptr %a, ptr %b) nounwind { ; X86-LABEL: t6: @@ -239,13 +239,13 @@ define i64 @t6(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) define i64 @t7(ptr %a, ptr %b) nounwind { ; X86-LABEL: t7: @@ -274,22 +274,27 @@ define i64 @t7(ptr %a, ptr %b) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a, align 8 + %0 = load <1 x i64>, ptr %a, align 8 %1 = load i32, ptr %b, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %0, i32 %1) - %3 = bitcast x86_mmx %2 to i64 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %0, i32 %1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) +declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) -define i64 @tt0(x86_mmx %t, ptr %q) nounwind { +define i64 @tt0(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt0: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddb (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -301,28 +306,34 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt0: ; X64: # %bb.0: # %entry -; X64-NEXT: paddb (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddb (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; 
X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() -define i64 @tt1(x86_mmx %t, ptr %q) nounwind { +define i64 @tt1(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt1: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -334,27 +345,33 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt1: ; X64: # %bb.0: # %entry -; X64-NEXT: paddw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) -define i64 @tt2(x86_mmx %t, ptr %q) nounwind { +define i64 @tt2(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt2: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddd (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -366,27 +383,33 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt2: ; X64: # %bb.0: # %entry -; X64-NEXT: paddd (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddd (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) -define i64 @tt3(x86_mmx %t, ptr %q) nounwind { +define i64 @tt3(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt3: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq 
{{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddq (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -398,27 +421,33 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt3: ; X64: # %bb.0: # %entry -; X64-NEXT: paddq (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddq (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) -define i64 @tt4(x86_mmx %t, ptr %q) nounwind { +define i64 @tt4(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt4: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddusb (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -430,27 +459,33 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt4: ; X64: # %bb.0: # %entry -; X64-NEXT: paddusb (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddusb (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) -define i64 @tt5(x86_mmx %t, ptr %q) nounwind { +define i64 @tt5(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt5: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: paddusw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -462,27 +497,33 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt5: ; X64: # %bb.0: # %entry -; X64-NEXT: paddusw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: paddusw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) -define i64 @tt6(x86_mmx %t, ptr %q) nounwind { +define i64 @tt6(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: 
tt6: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrlw (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -494,27 +535,33 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt6: ; X64: # %bb.0: # %entry -; X64-NEXT: psrlw (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrlw (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) -define i64 @tt7(x86_mmx %t, ptr %q) nounwind { +define i64 @tt7(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt7: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrld (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -526,27 +573,33 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt7: ; X64: # %bb.0: # %entry -; X64-NEXT: psrld (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrld (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) -define i64 @tt8(x86_mmx %t, ptr %q) nounwind { +define i64 @tt8(<1 x i64> %t, ptr %q) nounwind { ; X86-LABEL: tt8: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psrlq (%eax), %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -558,18 +611,19 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind { ; ; X64-LABEL: tt8: ; X64: # %bb.0: # %entry -; X64-NEXT: psrlq (%rdi), %mm0 +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: psrlq (%rsi), %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: emms ; X64-NEXT: retq entry: - %v = load x86_mmx, ptr %q - %u = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %t, x86_mmx %v) - %s = bitcast x86_mmx %u to i64 + %v = load <1 x i64>, ptr %q + %u = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x 
i64> %t, <1 x i64> %v) + %s = bitcast <1 x i64> %u to i64 call void @llvm.x86.mmx.emms() ret i64 %s } -declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) define void @test_psrlq_by_volatile_shift_amount(ptr %t) nounwind { ; X86-LABEL: test_psrlq_by_volatile_shift_amount: @@ -599,8 +653,8 @@ entry: call void @llvm.lifetime.start(i64 4, ptr nonnull %0) store volatile i32 1, ptr %0, align 4 %1 = load volatile i32, ptr %0, align 4 - %2 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx bitcast (<1 x i64> to x86_mmx), i32 %1) - store x86_mmx %2, ptr %t, align 8 + %2 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> , i32 %1) + store <1 x i64> %2, ptr %t, align 8 call void @llvm.lifetime.end(i64 4, ptr nonnull %0) ret void } @@ -609,28 +663,41 @@ declare void @llvm.lifetime.start(i64, ptr nocapture) declare void @llvm.lifetime.end(i64, ptr nocapture) ; Make sure we shrink this vector load and fold it. -define x86_mmx @vec_load(ptr %x) { +define <1 x i64> @vec_load(ptr %x) { ; X86-LABEL: vec_load: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1] ; X86-NEXT: paddsb %mm0, %mm0 +; X86-NEXT: movq %mm0, (%esp) +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: vec_load: ; X64: # %bb.0: ; X64-NEXT: pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1] ; X64-NEXT: paddsb %mm0, %mm0 -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq %z = load <4 x float>, ptr %x %y = extractelement <4 x float> %z, i32 0 %a = insertelement <2 x float> undef, float %y, i32 0 %b = insertelement <2 x float> %a, float %y, i32 1 - %c = bitcast <2 x float> %b to x86_mmx - %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c) - ret x86_mmx %d + %c = bitcast <2 x float> %b to <1 x i64> + %d = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %c, <1 x i64> %c) + ret <1 x i64> %d } -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/mmx-fold-zero.ll b/llvm/test/CodeGen/X86/mmx-fold-zero.ll index b2c94e3aaa3a65..a6e1275875dbc8 100644 --- a/llvm/test/CodeGen/X86/mmx-fold-zero.ll +++ b/llvm/test/CodeGen/X86/mmx-fold-zero.ll @@ -115,32 +115,32 @@ define double @mmx_zero(double, double, double, double) nounwind { ; X64-LARGE-NEXT: paddw %mm2, %mm0 ; X64-LARGE-NEXT: movq2dq %mm0, %xmm0 ; X64-LARGE-NEXT: retq - %5 = bitcast double %0 to x86_mmx - %6 = bitcast double %1 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %6) - %8 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %7, x86_mmx bitcast (double 0.000000e+00 to x86_mmx)) - %9 = bitcast double %2 to x86_mmx - %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %8, x86_mmx %9) - %11 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %5, x86_mmx %10) - %12 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %6, x86_mmx %11) - %13 = bitcast double %3 to x86_mmx - %14 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %12, x86_mmx %13) - %15 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %14, x86_mmx %9) - %16 
= tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %15, x86_mmx %13) - %17 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %16, x86_mmx %10) - %18 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %17, x86_mmx %11) - %19 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %18, x86_mmx %8) - %20 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %19, x86_mmx %7) - %21 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %20, x86_mmx bitcast (double 0.000000e+00 to x86_mmx)) - %22 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %21, x86_mmx %12) - %23 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %22, x86_mmx %15) - %24 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %23, x86_mmx %6) - %25 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %24, x86_mmx %16) - %26 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %25, x86_mmx %17) - %27 = bitcast x86_mmx %26 to double + %5 = bitcast double %0 to <1 x i64> + %6 = bitcast double %1 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %5, <1 x i64> %6) + %8 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %7, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>)) + %9 = bitcast double %2 to <1 x i64> + %10 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %8, <1 x i64> %9) + %11 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %5, <1 x i64> %10) + %12 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %6, <1 x i64> %11) + %13 = bitcast double %3 to <1 x i64> + %14 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %12, <1 x i64> %13) + %15 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %14, <1 x i64> %9) + %16 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %15, <1 x i64> %13) + %17 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %16, <1 x i64> %10) + %18 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %17, <1 x i64> %11) + %19 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %18, <1 x i64> %8) + %20 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %19, <1 x i64> %7) + %21 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %20, <1 x i64> bitcast (double 0.000000e+00 to <1 x i64>)) + %22 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %21, <1 x i64> %12) + %23 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %22, <1 x i64> %15) + %24 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %23, <1 x i64> %6) + %25 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %24, <1 x i64> %16) + %26 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %25, <1 x i64> %17) + %27 = bitcast <1 x i64> %26 to double ret double %27 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/mmx-intrinsics.ll b/llvm/test/CodeGen/X86/mmx-intrinsics.ll index a43d9400cde6c8..a7b6ed416622ef 100644 --- a/llvm/test/CodeGen/X86/mmx-intrinsics.ll +++ b/llvm/test/CodeGen/X86/mmx-intrinsics.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+ssse3,-avx | FileCheck %s --check-prefixes=ALL,X64 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+avx | FileCheck %s --check-prefixes=ALL,X64 -declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 
@test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test1: @@ -32,24 +32,24 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test88: @@ -77,24 +77,24 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test88: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test87: @@ -122,24 +122,24 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test87: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %mmx_var.i, 
<1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test86: @@ -167,24 +167,24 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test86: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpgtb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpgtb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test85: @@ -212,24 +212,24 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test85: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test84: @@ -257,24 +257,24 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test84: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = 
bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test83: @@ -302,24 +302,24 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test83: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pcmpeqb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pcmpeqb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test82: @@ -347,24 +347,24 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test82: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test81: @@ -392,24 +392,24 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) 
nounwind readnone optsize ssp { ; ; X64-LABEL: test81: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test80: @@ -437,24 +437,24 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test80: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test79: @@ -482,24 +482,24 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test79: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = 
bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test78: @@ -527,24 +527,24 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test78: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test77: @@ -572,24 +572,24 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test77: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7] +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test76: @@ -617,24 +617,24 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test76: ; X64: # 
%bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packuswb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packuswb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test75: @@ -662,24 +662,24 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test75: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packssdw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packssdw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test74: @@ -707,24 +707,24 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test74: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: packsswb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: packsswb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } 
-declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) nounwind readnone define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test73: @@ -754,15 +754,15 @@ define i64 @test73(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) nounwind readnone define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test72: @@ -792,9 +792,9 @@ define i64 @test72(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -825,15 +825,15 @@ define i64 @test72_2(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test71: @@ -859,13 +859,13 @@ define i64 @test71(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to i64 + %mmx_var.i = bitcast i64 %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } -declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) nounwind readnone define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test70: @@ -895,9 +895,9 @@ define i64 @test70(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail 
call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -928,15 +928,15 @@ define i64 @test70_2(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) nounwind readnone define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test69: @@ -966,15 +966,15 @@ define i64 @test69(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test68: @@ -1000,13 +1000,13 @@ define i64 @test68(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to i64 + %mmx_var.i = bitcast i64 %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } -declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) nounwind readnone define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test67: @@ -1036,15 +1036,15 @@ define i64 @test67(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) nounwind readnone define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test66: @@ -1074,9 +1074,9 @@ define i64 @test66(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = 
bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -1107,15 +1107,15 @@ define i64 @test66_2(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test65: @@ -1146,17 +1146,17 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test64: @@ -1187,17 +1187,17 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test63: @@ -1224,15 +1224,15 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx + %mmx_var.i = bitcast i64 %0 to <1 x i64> %1 = 
extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test62: @@ -1263,17 +1263,17 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test61: @@ -1304,17 +1304,17 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test60: @@ -1341,15 +1341,15 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx + %mmx_var.i = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone 
optsize ssp { ; X86-LABEL: test59: @@ -1380,17 +1380,17 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test58: @@ -1421,17 +1421,17 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test56: @@ -1459,24 +1459,24 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test56: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pxor %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test55: @@ -1504,24 +1504,24 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test55: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 
; X64-NEXT: por %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test54: @@ -1549,24 +1549,24 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test54: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pandn %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pandn %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test53: @@ -1594,24 +1594,24 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test53: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pand %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test52: @@ -1639,18 +1639,18 @@ define i64 @test52(<1 x 
i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test52: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmullw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -1682,24 +1682,24 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test51: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmullw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test50: @@ -1727,24 +1727,24 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test50: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test49: @@ -1772,24 +1772,24 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; 
X64-LABEL: test49: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaddwd %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test48: @@ -1817,24 +1817,24 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test48: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubusw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubusw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test47: @@ -1862,24 +1862,24 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test47: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubusb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubusb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx 
@llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test46: @@ -1907,24 +1907,24 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test46: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test45: @@ -1952,18 +1952,18 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test45: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubsb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubsb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -1994,17 +1994,17 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test43(<1 x i64> 
%a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test43: @@ -2032,24 +2032,24 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test43: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test42: @@ -2077,24 +2077,24 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test42: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test41: @@ -2122,24 +2122,24 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test41: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psubb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psubb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> 
%mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test40: @@ -2167,24 +2167,24 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test40: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddusw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test39: @@ -2212,24 +2212,24 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test39: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddusb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test38: @@ -2257,24 +2257,24 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test38: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 
= bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test37: @@ -2302,24 +2302,24 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test37: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddsb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test36: @@ -2346,15 +2346,15 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test35: @@ -2382,24 +2382,24 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test35: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddd %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> 
%mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test34: @@ -2427,24 +2427,24 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test34: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test33: @@ -2472,24 +2472,24 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test33: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: paddb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test32: @@ -2517,22 +2517,22 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test32: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: psadbw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 
to i64 + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test31: @@ -2560,24 +2560,24 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test31: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pminsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test30: @@ -2605,24 +2605,24 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test30: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pminub %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test29: @@ -2650,24 +2650,24 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test29: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaxsw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx 
@llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test28: @@ -2695,24 +2695,24 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test28: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmaxub %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test27: @@ -2740,24 +2740,24 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test27: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pavgw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test26: @@ -2785,24 +2785,24 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test26: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pavgb %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> 
%b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind +declare void @llvm.x86.mmx.movnt.dq(ptr, <1 x i64>) nounwind define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp { ; X86-LABEL: test25: @@ -2819,12 +2819,12 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind + %mmx_var.i = bitcast i64 %0 to <1 x i64> + tail call void @llvm.x86.mmx.movnt.dq(ptr %p, <1 x i64> %mmx_var.i) nounwind ret void } -declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone +declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>) nounwind readnone define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test24: @@ -2850,12 +2850,12 @@ define i32 @test24(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx - %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind + %mmx_var.i = bitcast <8 x i8> %0 to <1 x i64> + %1 = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) nounwind ret i32 %1 } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp { ; X86-LABEL: test23: @@ -2884,21 +2884,21 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp { ; ; X64-LABEL: test23: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: maskmovq %mm1, %mm0 +; X64-NEXT: maskmovq %mm0, %mm1 ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %n to <8 x i8> %1 = bitcast <1 x i64> %d to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + tail call void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) nounwind ret void } -declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test22: @@ -2926,24 +2926,24 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test22: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhuw %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: 
%0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test21: @@ -2972,9 +2972,9 @@ define i64 @test21(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -3005,15 +3005,15 @@ define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <2 x i32> %5 = extractelement <2 x i32> %4, i32 0 ret i32 %5 } -declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test20: @@ -3041,22 +3041,22 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test20: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmuludq %mm0, %mm1 ; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test19: @@ -3081,12 +3081,12 @@ define <2 x double> @test19(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = 
bitcast <1 x i64> %a to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone + %1 = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %1) nounwind readnone ret <2 x double> %2 } -declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp { ; X86-LABEL: test18: @@ -3109,14 +3109,14 @@ define i64 @test18(<2 x double> %a) nounwind readnone optsize ssp { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone - %1 = bitcast x86_mmx %0 to <2 x i32> + %0 = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone + %1 = bitcast <1 x i64> %0 to <2 x i32> %2 = bitcast <2 x i32> %1 to <1 x i64> %3 = extractelement <1 x i64> %2, i32 0 ret i64 %3 } -declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp { ; X86-LABEL: test17: @@ -3139,14 +3139,14 @@ define i64 @test17(<2 x double> %a) nounwind readnone optsize ssp { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone - %1 = bitcast x86_mmx %0 to <2 x i32> + %0 = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone + %1 = bitcast <1 x i64> %0 to <2 x i32> %2 = bitcast <2 x i32> %1 to <1 x i64> %3 = extractelement <1 x i64> %2, i32 0 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test16: @@ -3173,15 +3173,15 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %mmx_var, <1 x i64> %mmx_var1, i8 16) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test15: @@ -3210,15 +3210,15 @@ define i64 @test15(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <2 x i32> + %1 = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone +declare <1 x i64> 
@llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test14: @@ -3247,15 +3247,15 @@ define i64 @test14(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp { ; X86-LABEL: test13: @@ -3284,15 +3284,15 @@ define i64 @test13(<1 x i64> %a) nounwind readnone optsize ssp { ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %a to <8 x i8> - %1 = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <8 x i8> + %1 = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test12: @@ -3320,24 +3320,24 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test12: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test11: @@ -3365,24 +3365,24 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test11: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = 
bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test10: @@ -3410,24 +3410,24 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test10: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: psignb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: psignb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test9: @@ -3455,24 +3455,24 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test9: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pshufb %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pshufb %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test8: @@ -3500,24 +3500,24 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test8: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 ; X64-NEXT: pmulhrsw %mm0, %mm1 ; 
X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test7: @@ -3545,24 +3545,24 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test7: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: pmaddubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: pmaddubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test6: @@ -3590,24 +3590,24 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test6: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test5: @@ -3635,24 +3635,24 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone 
optsize ssp { ; ; X64-LABEL: test5: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test4: @@ -3680,24 +3680,24 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test4: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phsubw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phsubw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test3: @@ -3725,24 +3725,24 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test3: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddsw %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddsw %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> 
@llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; X86-LABEL: test2: @@ -3770,33 +3770,49 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { ; ; X64-LABEL: test2: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rdi, %mm0 -; X64-NEXT: movq %rsi, %mm1 -; X64-NEXT: phaddd %mm1, %mm0 -; X64-NEXT: movq %mm0, %rax +; X64-NEXT: movq %rsi, %mm0 +; X64-NEXT: movq %rdi, %mm1 +; X64-NEXT: phaddd %mm0, %mm1 +; X64-NEXT: movq %mm1, %rax ; X64-NEXT: retq entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind { -; ALL-LABEL: test89: -; ALL: # %bb.0: -; ALL-NEXT: cvtpi2ps %mm0, %xmm0 -; ALL-NEXT: ret{{[l|q]}} - %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b) +define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind { +; X86-LABEL: test89: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: cvtpi2ps (%esp), %xmm0 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test89: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %mm0 +; X64-NEXT: cvtpi2ps %mm0, %xmm0 +; X64-NEXT: retq + %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, <1 x i64> %b) ret <4 x float> %c } -declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone define void @test90() { ; ALL-LABEL: test90: @@ -3836,13 +3852,11 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq entry: - %0 = bitcast <1 x i64> %a.coerce to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2) - %2 = bitcast x86_mmx %1 to <1 x i64> - ret <1 x i64> %2 + %1 = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> %a.coerce, i32 %d, i32 2) + ret <1 x i64> %1 } -declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg) +declare <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64>, i32, i32 immarg) define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind { ; X86-LABEL: test_mm_extract_pi16: @@ -3867,9 +3881,8 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind { ; X64-NEXT: pextrw $2, %mm0, %eax ; X64-NEXT: retq entry: - %0 = bitcast <1 x i64> %a.coerce to x86_mmx - %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2) + %1 = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> %a.coerce, i32 2) ret i32 %1 } -declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg) +declare i32 @llvm.x86.mmx.pextr.w(<1 x i64>, i32 immarg) diff --git a/llvm/test/CodeGen/X86/mmx-only.ll b/llvm/test/CodeGen/X86/mmx-only.ll index 
eab67e08b95743..8a87350a794294 100644 --- a/llvm/test/CodeGen/X86/mmx-only.ll +++ b/llvm/test/CodeGen/X86/mmx-only.ll @@ -3,7 +3,7 @@ ; Test that turning off sse doesn't turn off mmx. -declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone { ; CHECK-LABEL: @test88 @@ -11,10 +11,10 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 diff --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll index fd8bd1facaf6b2..6bb564c4b757e6 100644 --- a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll +++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll @@ -1,18 +1,18 @@ ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+mmx,+fma,+f16c,+avx512f -stop-after finalize-isel -o - %s | FileCheck %s ; This test ensures that the MXCSR is implicitly used by MMX FP instructions. -define x86_mmx @mxcsr_mmx(<4 x float> %a0) { +define <1 x i64> @mxcsr_mmx(<4 x float> %a0) { ; CHECK: MMX_CVTPS2PIrr %{{[0-9]}}, implicit $mxcsr ; CHECK: MMX_CVTPI2PSrr %{{[0-9]}}, killed %{{[0-9]}}, implicit $mxcsr ; CHECK: MMX_CVTTPS2PIrr killed %{{[0-9]}}, implicit $mxcsr ; CHECK: MMX_CVTPI2PDrr killed %{{[0-9]$}} ; CHECK: MMX_CVTPD2PIrr killed %{{[0-9]}}, implicit $mxcsr - %1 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) - %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %1) - %3 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %2) - %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %3) - %5 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %4) - ret x86_mmx %5 + %1 = call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %a0) + %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %1) + %3 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %2) + %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %3) + %5 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %4) + ret <1 x i64> %5 } define half @mxcsr_f16c(float %a) { @@ -41,11 +41,11 @@ define <8 x double> @mxcsr_fma_sae(<8 x double> %a, <8 x double> %b, <8 x double ret <8 x double> %res } -declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) -declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) -declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) -declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) +declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) +declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) +declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>) +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) +declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, 
<4 x float>, <4 x float>) declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll index 1f273eb43c6a60..3b6ffacb0b230e 100644 --- a/llvm/test/CodeGen/X86/nontemporal.ll +++ b/llvm/test/CodeGen/X86/nontemporal.ll @@ -193,11 +193,11 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) { ; X64-NEXT: movntq %mm0, (%rsi) ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %a0 - %1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3) - store x86_mmx %1, ptr %a1, align 8, !nontemporal !0 + %0 = load <1 x i64>, ptr %a0 + %1 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 3) + store <1 x i64> %1, ptr %a1, align 8, !nontemporal !0 ret void } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone !0 = !{i32 1} diff --git a/llvm/test/CodeGen/X86/pr13859.ll b/llvm/test/CodeGen/X86/pr13859.ll index 9b290e6947d58d..35466478f289b1 100644 --- a/llvm/test/CodeGen/X86/pr13859.ll +++ b/llvm/test/CodeGen/X86/pr13859.ll @@ -13,8 +13,7 @@ entry: %a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1 %a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2 %a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3 - %a40 = bitcast <4 x i16> %a39 to x86_mmx - %a41 = bitcast x86_mmx %a40 to <1 x i64> + %a40 = bitcast <4 x i16> %a39 to <1 x i64> %a47 = trunc i32 %a32 to i1 br i1 %a47, label %a48, label %a49 @@ -23,6 +22,6 @@ a48: unreachable a49: - store <1 x i64> %a41, ptr %dest, align 8 ; !!! + store <1 x i64> %a40, ptr %dest, align 8 ; !!! ret void } diff --git a/llvm/test/CodeGen/X86/pr23246.ll b/llvm/test/CodeGen/X86/pr23246.ll index 45587b8c69cd40..da3246a917ea3d 100644 --- a/llvm/test/CodeGen/X86/pr23246.ll +++ b/llvm/test/CodeGen/X86/pr23246.ll @@ -6,15 +6,14 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; PR23246 ; We're really only interested in doing something sane with the shuffle. 
-define <2 x i64> @test(x86_mmx %a) #0 { +define <2 x i64> @test(<1 x i64> %a) #0 { ; CHECK-LABEL: test: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %rdi, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; CHECK-NEXT: retq entry: - %b = bitcast x86_mmx %a to <1 x i64> - %s = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> + %s = shufflevector <1 x i64> %a, <1 x i64> undef, <2 x i32> ret <2 x i64> %s } diff --git a/llvm/test/CodeGen/X86/pr29222.ll b/llvm/test/CodeGen/X86/pr29222.ll index 9a38515b65594c..9814361404f2d4 100644 --- a/llvm/test/CodeGen/X86/pr29222.ll +++ b/llvm/test/CodeGen/X86/pr29222.ll @@ -32,7 +32,7 @@ define i32 @PR29222(i32) nounwind { ; X86-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] ; X86-AVX-NEXT: packsswb %mm0, %mm0 ; X86-AVX-NEXT: movq %mm0, (%esp) -; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vpbroadcastq (%esp), %xmm0 ; X86-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovd %xmm0, %eax ; X86-AVX-NEXT: movl %ebp, %esp @@ -60,9 +60,9 @@ define i32 @PR29222(i32) nounwind { ; X64-AVX-NEXT: retq %2 = insertelement <2 x i32> undef, i32 %0, i32 0 %3 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer - %4 = bitcast <2 x i32> %3 to x86_mmx - %5 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %4, x86_mmx %4) - %6 = bitcast x86_mmx %5 to i64 + %4 = bitcast <2 x i32> %3 to <1 x i64> + %5 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %4, <1 x i64> %4) + %6 = bitcast <1 x i64> %5 to i64 %7 = insertelement <2 x i64> undef, i64 %6, i32 0 %8 = bitcast <2 x i64> %7 to <8 x i16> %9 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %8, <8 x i16> undef) @@ -71,5 +71,5 @@ define i32 @PR29222(i32) nounwind { ret i32 %11 } -declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll index b6022698edaeb9..0ad35309b87bb4 100644 --- a/llvm/test/CodeGen/X86/pr35982.ll +++ b/llvm/test/CodeGen/X86/pr35982.ll @@ -35,9 +35,9 @@ define float @PR35982_emms(<1 x i64>) nounwind { %2 = bitcast <1 x i64> %0 to <2 x i32> %3 = extractelement <2 x i32> %2, i32 0 %4 = extractelement <1 x i64> %0, i32 0 - %5 = bitcast i64 %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %5, x86_mmx %5) - %7 = bitcast x86_mmx %6 to <2 x i32> + %5 = bitcast i64 %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %5, <1 x i64> %5) + %7 = bitcast <1 x i64> %6 to <2 x i32> %8 = extractelement <2 x i32> %7, i32 0 tail call void @llvm.x86.mmx.emms() %9 = sitofp i32 %3 to float @@ -46,5 +46,5 @@ define float @PR35982_emms(<1 x i64>) nounwind { ret float %11 } -declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/select-mmx.ll b/llvm/test/CodeGen/X86/select-mmx.ll index 27b7ebb8381cd3..8a4308a5af64b2 100644 --- a/llvm/test/CodeGen/X86/select-mmx.ll +++ b/llvm/test/CodeGen/X86/select-mmx.ll @@ -14,15 +14,11 @@ define i64 @test47(i64 %arg) { ; ; X64-LABEL: test47: ; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB0_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: pxor %mm0, %mm0 -; X64-NEXT: jmp .LBB0_3 -; X64-NEXT: .LBB0_1: -; X64-NEXT: movl $7, %eax -; X64-NEXT: movd %eax, 
%mm0 -; X64-NEXT: .LBB0_3: +; X64-NEXT: movl $7, %ecx +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %mm0 ; X64-NEXT: psllw %mm0, %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq @@ -35,17 +31,17 @@ define i64 @test47(i64 %arg) { ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: orl 12(%ebp), %eax -; X86-NEXT: je .LBB0_1 -; X86-NEXT: # %bb.2: -; X86-NEXT: pxor %mm0, %mm0 -; X86-NEXT: jmp .LBB0_3 -; X86-NEXT: .LBB0_1: ; X86-NEXT: movl $7, %eax -; X86-NEXT: movd %eax, %mm0 -; X86-NEXT: .LBB0_3: +; X86-NEXT: je .LBB0_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB0_2: +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: psllw %mm0, %mm0 ; X86-NEXT: movq %mm0, (%esp) ; X86-NEXT: movl (%esp), %eax @@ -55,9 +51,9 @@ define i64 @test47(i64 %arg) { ; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl %cond = icmp eq i64 %arg, 0 - %slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx) - %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct) - %retc = bitcast x86_mmx %psll to i64 + %slct = select i1 %cond, <1 x i64> bitcast (i64 7 to <1 x i64>), <1 x i64> bitcast (i64 0 to <1 x i64>) + %psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct) + %retc = bitcast <1 x i64> %psll to i64 ret i64 %retc } @@ -74,13 +70,8 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) { ; X64-LABEL: test49: ; X64: # %bb.0: ; X64-NEXT: testq %rdi, %rdi -; X64-NEXT: je .LBB1_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: movq %rdx, %mm0 -; X64-NEXT: jmp .LBB1_3 -; X64-NEXT: .LBB1_1: +; X64-NEXT: cmovneq %rdx, %rsi ; X64-NEXT: movq %rsi, %mm0 -; X64-NEXT: .LBB1_3: ; X64-NEXT: psllw %mm0, %mm0 ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: retq @@ -113,13 +104,13 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) { ; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl %cond = icmp eq i64 %arg, 0 - %xmmx = bitcast i64 %x to x86_mmx - %ymmx = bitcast i64 %y to x86_mmx - %slct = select i1 %cond, x86_mmx %xmmx, x86_mmx %ymmx - %psll = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %slct, x86_mmx %slct) - %retc = bitcast x86_mmx %psll to i64 + %xmmx = bitcast i64 %x to <1 x i64> + %ymmx = bitcast i64 %y to <1 x i64> + %slct = select i1 %cond, <1 x i64> %xmmx, <1 x i64> %ymmx + %psll = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %slct, <1 x i64> %slct) + %retc = bitcast <1 x i64> %psll to i64 ret i64 %retc } -declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/stack-folding-mmx.ll b/llvm/test/CodeGen/X86/stack-folding-mmx.ll index 11ca9e2a547eef..6eb99dd6c67582 100644 --- a/llvm/test/CodeGen/X86/stack-folding-mmx.ll +++ b/llvm/test/CodeGen/X86/stack-folding-mmx.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+ssse3 | FileCheck %s -define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) { +define <1 x i64> @stack_fold_cvtpd2pi(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvtpd2pi: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9,45 +9,47 @@ define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtpd2pi 
{{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone - ret x86_mmx %2 + %2 = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone -define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) { +define <2 x double> @stack_fold_cvtpi2pd(<1 x i64> %a0) { ; CHECK-LABEL: stack_fold_cvtpi2pd: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtpi2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %a0) nounwind readnone + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %a0) nounwind readnone ret <2 x double> %2 } -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone -define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) { +define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) { ; CHECK-LABEL: stack_fold_cvtpi2ps: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtpi2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %a1) nounwind readnone + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, <1 x i64> %a1) nounwind readnone ret <4 x float> %2 } -declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) { +define <1 x i64> @stack_fold_cvtps2pi(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtps2pi: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -55,15 +57,15 @@ define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvtps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone - ret x86_mmx %2 + %2 = call <1 x i64> 
@llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone -define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) { +define <1 x i64> @stack_fold_cvttpd2pi(<2 x double> %a0) { ; CHECK-LABEL: stack_fold_cvttpd2pi: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -71,15 +73,15 @@ define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvttpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone - ret x86_mmx %2 + %2 = call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone -define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) { +define <1 x i64> @stack_fold_cvttps2pi(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvttps2pi: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -87,18 +89,18 @@ define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) { ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: cvttps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone - ret x86_mmx %2 + %2 = call <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone ; TODO stack_fold_movd_load ; padd forces execution on mmx -define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind { +define i32 @stack_fold_movd_store(<1 x i64> %a0) nounwind { ; CHECK-LABEL: stack_fold_movd_store: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp @@ -107,6 +109,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: paddb %mm0, %mm0 ; CHECK-NEXT: movd %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: #APP @@ -120,8 +123,8 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq - %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0) - %2 = bitcast x86_mmx %1 to <2 x i32> + %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0) + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = extractelement <2 x i32> %2, i32 0 %4 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() ret i32 %3 @@ -130,7 +133,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind 
{ ; TODO stack_fold_movq_load ; padd forces execution on mmx -define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind { +define i64 @stack_fold_movq_store(<1 x i64> %a0) nounwind { ; CHECK-LABEL: stack_fold_movq_store: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp @@ -139,6 +142,7 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: paddb %mm0, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP @@ -152,1171 +156,1235 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind { ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq - %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a0) - %2 = bitcast x86_mmx %1 to i64 + %1 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a0, <1 x i64> %a0) + %2 = bitcast <1 x i64> %1 to i64 %3 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() ret i64 %2 } -define x86_mmx @stack_fold_pabsb(x86_mmx %a0) { +define <1 x i64> @stack_fold_pabsb(<1 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsb: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %a0) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pabsd(x86_mmx %a0) { +define <1 x i64> @stack_fold_pabsd(<1 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsd: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %a0) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pabsw(x86_mmx %a0) { +define <1 x i64> @stack_fold_pabsw(<1 x i64> %a0) { ; CHECK-LABEL: stack_fold_pabsw: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pabsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: 
retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %a0) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %a0) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone -define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_packssdw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_packssdw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packssdw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_packsswb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_packsswb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packsswb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packsswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_packuswb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_packuswb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: packuswb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) 
nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %a, <1 x i64> %b) 
nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddsb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddsb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddsb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddusb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddusb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddusb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) 
nounwind readnone -define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddusw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddusw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddusw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_paddw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_paddw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: paddw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: paddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_palignr(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_palignr(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_palignr: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: palignr $1, %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %a, x86_mmx %b, i8 1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %a, <1 x i64> %b, i8 1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone -define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pand(<1 x i64> %a, <1 x i64> %b) { ; 
CHECK-LABEL: stack_fold_pand: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pand %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pand {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pandn(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pandn: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pandn %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pavgb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pavgb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pavgb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pavgb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pavgw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pavgw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pavgw %mm0, %mm1 +; CHECK-NEXT: 
movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pavgw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpeqb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpeqb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpeqd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpeqd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpeqw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpeqw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpeqw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: 
movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpgtb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpgtb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpgtd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpgtd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pcmpgtw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pcmpgtw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pcmpgtw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx 
@llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phaddd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phaddd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phaddd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phaddsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phaddsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phaddsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phaddw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phaddw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phaddw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phaddw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phaddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", 
"=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phsubd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phsubd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phsubd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phsubsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phsubsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phsubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_phsubw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_phsubw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_phsubw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: phsubw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: phsubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x 
i64> %2 } -declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone ; TODO stack_fold_pinsrw -define x86_mmx @stack_fold_pmaddubsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmaddubsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmaddubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaddubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmaddwd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmaddwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaddwd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmaxsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmaxsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaxsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> 
@llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmaxub(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmaxub: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmaxub %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pminsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pminsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pminsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pminsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pminub(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pminub: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pminub %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pminub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmulhrsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmulhrsw(<1 x i64> %a, <1 x 
i64> %b) { ; CHECK-LABEL: stack_fold_pmulhrsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhrsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmulhuw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmulhuw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhuw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmulhw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmulhw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmulhw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmullw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmullw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq 
%rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmullw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmullw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pmuludq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pmuludq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pmuludq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_por(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_por: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: por %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psadbw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psadbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psadbw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psadbw 
{{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pshufb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pshufb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pshufb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload -; CHECK-NEXT: pshufb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pshufw(x86_mmx %a) { +define <1 x i64> @stack_fold_pshufw(<1 x i64> %a) { ; CHECK-LABEL: stack_fold_pshufw: ; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %mm0 ; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: pshufw $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload ; CHECK-NEXT: # mm0 = mem[1,0,0,0] -; CHECK-NEXT: movq2dq %mm0, %xmm0 +; CHECK-NEXT: movq %mm0, %rax ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %a, i8 1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone -define x86_mmx @stack_fold_psignb(x86_mmx %a0, x86_mmx %a1) { +define <1 x i64> @stack_fold_psignb(<1 x i64> %a0, <1 x i64> %a1) { ; CHECK-LABEL: stack_fold_psignb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; 
CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %a0, x86_mmx %a1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psignd(x86_mmx %a0, x86_mmx %a1) { +define <1 x i64> @stack_fold_psignd(<1 x i64> %a0, <1 x i64> %a1) { ; CHECK-LABEL: stack_fold_psignd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %a0, x86_mmx %a1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psignw(x86_mmx %a0, x86_mmx %a1) { +define <1 x i64> @stack_fold_psignw(<1 x i64> %a0, <1 x i64> %a1) { ; CHECK-LABEL: stack_fold_psignw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psignw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psignw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %a0, x86_mmx %a1) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %a0, <1 x i64> %a1) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pslld(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pslld: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pslld %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pslld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx 
@llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psllq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psllq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psllq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psllq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psllw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psllw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psllw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psllw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psrad(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psrad: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrad %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrad {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> 
@llvm.x86.mmx.psra.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psraw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psraw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psraw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psraw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psrld(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psrld: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrld %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psrlq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psrlq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrlq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrlq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, 
<1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psrlw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psrlw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psrlw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psrlw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubd %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubq: ; CHECK: # %bb.0: -; 
CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubq %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubsb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubsb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubsb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubsw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubsw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubsw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubusb(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubusb: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubusb %mm0, %mm1 +; CHECK-NEXT: movq %mm1, 
%rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubusw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubusw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubusw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_psubw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_psubw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: psubw %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: psubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpckhbw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpckhbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhbw 
{{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpckhdq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpckhdq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[1],mem[1] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpckhwd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpckhwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[2],mem[2],mm0[3],mem[3] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpcklbw(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpcklbw: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpcklbw %mm0, %mm1 # mm1 
= mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpckldq(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpckldq: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_punpcklwd(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_punpcklwd: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: # mm0 = mm0[0],mem[0],mm0[1],mem[1] -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone -define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) { +define <1 x i64> @stack_fold_pxor(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: stack_fold_pxor: ; CHECK: # %bb.0: -; 
CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %mm0 +; CHECK-NEXT: movq %rdi, %mm1 +; CHECK-NEXT: pxor %mm0, %mm1 +; CHECK-NEXT: movq %mm1, %rax ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: pxor {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload -; CHECK-NEXT: movq2dq %mm0, %xmm0 ; CHECK-NEXT: retq - %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() - %2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone - ret x86_mmx %2 + %1 = tail call <1 x i64> asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() + %2 = call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %a, <1 x i64> %b) nounwind readnone + ret <1 x i64> %2 } -declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll index 672b4591316ce8..cd375c04168818 100644 --- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll @@ -20,9 +20,9 @@ define i32 @test0(ptr %v4) nounwind { entry: %v5 = load <1 x i64>, ptr %v4, align 8 %v12 = bitcast <1 x i64> %v5 to <4 x i16> - %v13 = bitcast <4 x i16> %v12 to x86_mmx - %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18) - %v15 = bitcast x86_mmx %v14 to <4 x i16> + %v13 = bitcast <4 x i16> %v12 to <1 x i64> + %v14 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %v13, i8 -18) + %v15 = bitcast <1 x i64> %v14 to <4 x i16> %v16 = bitcast <4 x i16> %v15 to <1 x i64> %v17 = extractelement <1 x i64> %v16, i32 0 %v18 = bitcast i64 %v17 to <2 x i32> @@ -52,12 +52,12 @@ entry: %0 = load i32, ptr %ptr, align 4 %1 = insertelement <2 x i32> undef, i32 %0, i32 0 %2 = insertelement <2 x i32> %1, i32 0, i32 1 - %3 = bitcast <2 x i32> %2 to x86_mmx - %4 = bitcast x86_mmx %3 to i64 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = bitcast <1 x i64> %3 to i64 %5 = bitcast i64 %4 to <4 x i16> - %6 = bitcast <4 x i16> %5 to x86_mmx - %7 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %6, i8 -24) - %8 = bitcast x86_mmx %7 to <4 x i16> + %6 = bitcast <4 x i16> %5 to <1 x i64> + %7 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %6, i8 -24) + %8 = bitcast <1 x i64> %7 to <4 x i16> %9 = bitcast <4 x i16> %8 to <1 x i64> %10 = extractelement <1 x i64> %9, i32 0 %11 = bitcast i64 %10 to <2 x i32> @@ -82,9 +82,9 @@ define i32 @test2(ptr nocapture readonly %ptr) nounwind { ; X64-NEXT: emms ; X64-NEXT: retq entry: - %0 = load x86_mmx, ptr %ptr, align 8 - %1 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %0, i8 -24) - %2 = bitcast x86_mmx %1 to <4 x i16> + %0 = load <1 x i64>, ptr %ptr, align 8 + %1 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %0, i8 -24) + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 %5 = bitcast i64 %4 to <2 x i32> @@ -93,40 +93,39 @@ entry: ret i32 %6 } -define i32 @test3(x86_mmx %a) nounwind { +define i32 @test3(<1 x i64> %a) nounwind { ; X86-LABEL: test3: ; X86: # %bb.0: -; X86-NEXT: movd %mm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: test3: ; X64: # %bb.0: -; X64-NEXT: movd %mm0, %eax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq - %tmp0 = bitcast x86_mmx %a to <2 x i32> + %tmp0 = bitcast <1 x i64> %a to <2 x i32> %tmp1 = extractelement <2 
x i32> %tmp0, i32 0 ret i32 %tmp1 } ; Verify we don't muck with extractelts from the upper lane. -define i32 @test4(x86_mmx %a) nounwind { +define i32 @test4(<1 x i64> %a) nounwind { ; X86-LABEL: test4: ; X86: # %bb.0: -; X86-NEXT: movq2dq %mm0, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: test4: ; X64: # %bb.0: -; X64-NEXT: movq2dq %mm0, %xmm0 +; X64-NEXT: movq %rdi, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: retq - %tmp0 = bitcast x86_mmx %a to <2 x i32> + %tmp0 = bitcast <1 x i64> %a to <2 x i32> %tmp1 = extractelement <2 x i32> %tmp0, i32 1 ret i32 %tmp1 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) declare void @llvm.x86.mmx.emms() diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll index aa8a394f33a8bb..91743898545ee1 100644 --- a/llvm/test/CodeGen/X86/vec_insert-5.ll +++ b/llvm/test/CodeGen/X86/vec_insert-5.ll @@ -25,8 +25,8 @@ define void @t1(i32 %a, ptr %P) nounwind { %tmp12 = shl i32 %a, 12 %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1 %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0 - %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx - store x86_mmx %tmp23, ptr %P + %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64> + store <1 x i64> %tmp23, ptr %P ret void } diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll index cea047453de43e..67473febf28c77 100644 --- a/llvm/test/CodeGen/X86/vec_insert-7.ll +++ b/llvm/test/CodeGen/X86/vec_insert-7.ll @@ -5,21 +5,20 @@ ; MMX insertelement is not available; these are promoted to xmm. ; (Without SSE they are split to two ints, and the code is much better.) -define x86_mmx @mmx_movzl(x86_mmx %x) nounwind { +define <1 x i64> @mmx_movzl(<1 x i64> %x) nounwind { ; X86-LABEL: mmx_movzl: ; X86: ## %bb.0: ; X86-NEXT: movl $32, %eax -; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; ; X64-LABEL: mmx_movzl: ; X64: ## %bb.0: ; X64-NEXT: movl $32, %eax -; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: retq - %tmp = bitcast x86_mmx %x to <2 x i32> + %tmp = bitcast <1 x i64> %x to <2 x i32> %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 - %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx - ret x86_mmx %tmp9 + %tmp9 = bitcast <2 x i32> %tmp8 to <1 x i64> + ret <1 x i64> %tmp9 } diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll index f561a2a20e194f..f95b34685211d7 100644 --- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll @@ -3,22 +3,22 @@ ; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64 ; This is not an MMX operation; promoted to xmm. 
-define x86_mmx @t0(i32 %A) nounwind { +define <1 x i64> @t0(i32 %A) nounwind { ; X86-LABEL: t0: ; X86: ## %bb.0: -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 -; X86-NEXT: pxor %mm0, %mm0 -; X86-NEXT: punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0] +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: t0: ; X64: ## %bb.0: ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X64-NEXT: psllq $32, %xmm0 +; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: retq %tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 - %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx - ret x86_mmx %tmp4 + %tmp4 = bitcast <2 x i32> %tmp3 to <1 x i64> + ret <1 x i64> %tmp4 } define <8 x i8> @t1(i8 zeroext %x) nounwind { diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll index 709be6534d777d..60800673ed2dd4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll @@ -52,9 +52,9 @@ entry: %tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16> %tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 > %tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8> - %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx - %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx - tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, ptr null) + %tmp556 = bitcast <8 x i8> %tmp555 to <1 x i64> + %tmp557 = bitcast <8 x i8> zeroinitializer to <1 x i64> + tail call void @llvm.x86.mmx.maskmovq( <1 x i64> %tmp557, <1 x i64> %tmp556, ptr null) ret void } @@ -115,19 +115,19 @@ define <4 x float> @pr35869() nounwind { ; X64-NEXT: punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1] ; X64-NEXT: cvtpi2ps %mm0, %xmm0 ; X64-NEXT: retq - %1 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx bitcast (<8 x i8> to x86_mmx), x86_mmx bitcast (<8 x i8> zeroinitializer to x86_mmx)) - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx bitcast (<4 x i16> zeroinitializer to x86_mmx), x86_mmx %1) - %3 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %1, x86_mmx %2) - %4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, x86_mmx %3) + %1 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> bitcast (<8 x i8> to <1 x i64>), <1 x i64> bitcast (<8 x i8> zeroinitializer to <1 x i64>)) + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> bitcast (<4 x i16> zeroinitializer to <1 x i64>), <1 x i64> %1) + %3 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %1, <1 x i64> %2) + %4 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> zeroinitializer, <1 x i64> %3) %5 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> - %6 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %1, x86_mmx %2) - %7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, x86_mmx %6) + %6 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %1, <1 x i64> %2) + %7 = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %5, <1 x i64> %6) ret <4 x float> %7 } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) -declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) -declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) -declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) +declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x 
i64>) +declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/x86-64-psub.ll b/llvm/test/CodeGen/X86/x86-64-psub.ll index 9817d798fd4bf8..4c11464075ec92 100644 --- a/llvm/test/CodeGen/X86/x86-64-psub.ll +++ b/llvm/test/CodeGen/X86/x86-64-psub.ll @@ -32,11 +32,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8> - %3 = bitcast <8 x i8> %2 to x86_mmx + %3 = bitcast <8 x i8> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8> - %5 = bitcast <8 x i8> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <8 x i8> + %5 = bitcast <8 x i8> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <8 x i8> %8 = bitcast <8 x i8> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -66,11 +66,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16> - %3 = bitcast <4 x i16> %2 to x86_mmx + %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16> - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <4 x i16> + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <4 x i16> %8 = bitcast <4 x i16> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -100,11 +100,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <2 x i32> - %3 = bitcast <2 x i32> %2 to x86_mmx + %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <2 x i32> - %5 = bitcast <2 x i32> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <2 x i32> + %5 = bitcast <2 x i32> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <2 x i32> %8 = bitcast <2 x i32> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -134,11 +134,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8> - %3 = bitcast <8 x i8> %2 to x86_mmx + %3 = bitcast <8 x i8> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8> - %5 = bitcast <8 x i8> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <8 x i8> + %5 = bitcast <8 x i8> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <8 x i8> %8 = bitcast <8 x i8> %7 to <1 x 
i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -168,11 +168,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16> - %3 = bitcast <4 x i16> %2 to x86_mmx + %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16> - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <4 x i16> + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <4 x i16> %8 = bitcast <4 x i16> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -202,11 +202,11 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <8 x i8> - %3 = bitcast <8 x i8> %2 to x86_mmx + %3 = bitcast <8 x i8> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <8 x i8> - %5 = bitcast <8 x i8> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <8 x i8> + %5 = bitcast <8 x i8> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <8 x i8> %8 = bitcast <8 x i8> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 @@ -236,26 +236,26 @@ entry: %__m1.0.insert.i = insertelement <1 x i64> undef, i64 %0, i32 0 %__m2.0.insert.i = insertelement <1 x i64> undef, i64 %1, i32 0 %2 = bitcast <1 x i64> %__m1.0.insert.i to <4 x i16> - %3 = bitcast <4 x i16> %2 to x86_mmx + %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = bitcast <1 x i64> %__m2.0.insert.i to <4 x i16> - %5 = bitcast <4 x i16> %4 to x86_mmx - %6 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %3, x86_mmx %5) nounwind - %7 = bitcast x86_mmx %6 to <4 x i16> + %5 = bitcast <4 x i16> %4 to <1 x i64> + %6 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %3, <1 x i64> %5) nounwind + %7 = bitcast <1 x i64> %6 to <4 x i16> %8 = bitcast <4 x i16> %7 to <1 x i64> %retval.0.extract.i15 = extractelement <1 x i64> %8, i32 0 ret i64 %retval.0.extract.i15 } -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) 
nounwind readnone diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll index 39b2b6225d8b10..1d2e38eb5e63d8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test1( @@ -17,16 +17,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5:[0-9]+]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -35,16 +35,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 { ; 
CHECK-LABEL: define i64 @test88( @@ -57,16 +57,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -75,16 +75,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test87( @@ -97,16 +97,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], 
[[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -115,16 +115,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test86( @@ -137,16 +137,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] 
= tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -155,16 +155,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test85( @@ -177,16 +177,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr 
@__msan_retval_tls, align 8 @@ -195,16 +195,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test84( @@ -217,16 +217,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -235,16 +235,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> 
%2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test83( @@ -257,16 +257,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -275,16 +275,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test82( @@ -297,16 +297,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 
x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -315,16 +315,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test81( @@ -337,16 +337,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; 
CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -355,16 +355,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test80( @@ -377,16 +377,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: 
[[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -395,16 +395,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test79( @@ -417,16 +417,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -435,16 +435,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx 
%mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test78( @@ -457,16 +457,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -475,16 +475,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 { ; 
CHECK-LABEL: define i64 @test77( @@ -497,16 +497,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -515,16 +515,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test76( @@ -537,23 +537,22 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16> -; CHECK-NEXT: 
[[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -564,16 +563,16 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test75( @@ -586,23 +585,22 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast 
<2 x i32> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <2 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <2 x i1> [[TMP12]] to <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <4 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <4 x i16> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -613,16 +611,16 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test74( @@ -635,23 +633,22 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] 
to <4 x i16> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx -; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 @@ -662,16 +659,16 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64>, i32) nounwind readnone define i64 @test73(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test73( @@ -681,17 +678,15 @@ define i64 @test73(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -699,15 +694,15 @@ define i64 @test73(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64>, i32) nounwind readnone define i64 @test72(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test72( @@ -717,17 +712,15 @@ define i64 @test72(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 
x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -735,9 +728,9 @@ define i64 @test72(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -751,17 +744,15 @@ define i64 @test72_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -769,15 +760,15 @@ define i64 @test72_2(<1 x i64> %a) #0 { ; 
entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone define i64 @test71(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test71( @@ -787,25 +778,25 @@ define i64 @test71(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[TMP6]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[TMP2]], i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP4]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to i64 + %mmx_var.i = bitcast i64 %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } -declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64>, i32) nounwind readnone define i64 @test70(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test70( @@ -815,17 +806,15 @@ define i64 @test70(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast 
x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -833,9 +822,9 @@ define i64 @test70(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -849,17 +838,15 @@ define i64 @test70_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr 
@__msan_retval_tls, align 8 @@ -867,15 +854,15 @@ define i64 @test70_2(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64>, i32) nounwind readnone define i64 @test69(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test69( @@ -885,17 +872,15 @@ define i64 @test69(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -903,15 +888,15 @@ define i64 @test69(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone define i64 @test68(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test68( @@ -921,25 +906,25 @@ define i64 @test68(<1 x 
i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[TMP6]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[TMP2]], i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP4]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to i64 + %mmx_var.i = bitcast i64 %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to i64 ret i64 %2 } -declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64>, i32) nounwind readnone define i64 @test67(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test67( @@ -949,17 +934,15 @@ define i64 @test67(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -967,15 +950,15 @@ define i64 @test67(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <2 x i32> %3 = bitcast <2 x i32> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64>, i32) nounwind readnone define i64 @test66(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test66( @@ -985,17 +968,15 @@ define i64 @test66(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 3) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -1003,9 +984,9 @@ define i64 @test66(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 3) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 3) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = 
extractelement <1 x i64> %3, i32 0 ret i64 %4 @@ -1019,17 +1000,15 @@ define i64 @test66_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx -; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8 @@ -1037,15 +1016,15 @@ define i64 @test66_2(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %mmx_var.i, i32 0) nounwind - %2 = bitcast x86_mmx %1 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> + %1 = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> %mmx_var.i, i32 0) nounwind + %2 = bitcast <1 x i64> %1 to <4 x i16> %3 = bitcast <4 x i16> %2 to <1 x i64> %4 = extractelement <1 x i64> %3, i32 0 ret i64 %4 } -declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test65( @@ -1056,20 +1035,21 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: 
[[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1079,17 +1059,17 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test64( @@ -1100,20 +1080,21 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: 
[[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1123,17 +1104,17 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test63( @@ -1144,32 +1125,35 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx 
@llvm.x86.mmx.psrl.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64 ; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP5]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx + %mmx_var.i = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test62( @@ -1180,20 +1164,21 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> 
@llvm.x86.mmx.psrl.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1203,17 +1188,17 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test61( @@ -1224,20 +1209,21 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> 
@llvm.x86.mmx.psrl.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1247,17 +1233,17 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test60( @@ -1268,32 +1254,35 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64 ; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP5]] ; entry: %0 = extractelement <1 x 
i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx + %mmx_var.i = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test59( @@ -1304,20 +1293,21 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1327,17 +1317,17 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %0 to x86_mmx + %mmx_var.i = bitcast <2 x i32> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = 
bitcast x86_mmx %2 to <2 x i32> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test58( @@ -1348,20 +1338,21 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]]) +; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -1371,17 +1362,17 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %0 to x86_mmx + %mmx_var.i = bitcast <4 x i16> %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1.i = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var1.i = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x 
i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pxor(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test56( @@ -1394,16 +1385,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1412,16 +1403,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test55( @@ -1434,16 +1425,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast 
<2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1452,16 +1443,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pandn(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test54( @@ -1474,16 +1465,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = 
bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1492,16 +1483,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test53( @@ -1514,16 +1505,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x 
i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1532,16 +1523,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test52( @@ -1554,16 +1545,16 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1572,10 +1563,10 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast 
x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -1592,16 +1583,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1610,16 +1601,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test50( @@ -1632,16 +1623,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; 
CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1650,16 +1641,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test49( @@ -1672,38 +1663,38 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = 
bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP12]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP18]] ; entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test48( @@ -1716,16 +1707,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = 
bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1734,16 +1725,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test47( @@ -1756,16 +1747,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> 
[[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1774,16 +1765,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test46( @@ -1796,16 +1787,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1814,16 +1805,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> 
%1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test45( @@ -1836,16 +1827,16 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1854,10 +1845,10 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -1872,29 +1863,32 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 
[[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %mmx_var, x86_mmx %mmx_var1) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64>, <1 x i64>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test43( @@ -1907,16 +1901,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x 
i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1925,16 +1919,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test42( @@ -1947,16 +1941,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -1965,16 +1959,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call 
x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test41( @@ -1987,16 +1981,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2005,16 +1999,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: 
define i64 @test40( @@ -2027,16 +2021,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2045,16 +2039,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test39( @@ -2067,16 +2061,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: 
[[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2085,16 +2079,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test38( @@ -2107,16 +2101,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> 
@llvm.x86.mmx.padds.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2125,16 +2119,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test37( @@ -2147,16 +2141,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2165,16 
+2159,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test36( @@ -2185,27 +2179,30 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]]) +; CHECK-NEXT: [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %mmx_var, x86_mmx %mmx_var1) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> %mmx_var, <1 x i64> %mmx_var1) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test35( @@ -2218,16 +2215,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> 
[[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2236,16 +2233,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <2 x i32> + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test34( @@ -2258,16 +2255,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] 
= bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2276,16 +2273,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test33( @@ -2298,16 +2295,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x 
i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2316,16 +2313,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test32( @@ -2338,30 +2335,33 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP16]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 48 -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 -; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64 [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP17]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP14]] to i64 +; CHECK-NEXT: store i64 [[TMP15]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = 
bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test31( @@ -2374,16 +2374,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2392,16 +2392,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test30( @@ -2414,16 +2414,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: 
[[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2432,16 +2432,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test29( @@ -2454,16 +2454,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2472,16 +2472,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test28( @@ -2494,16 +2494,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = 
bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2512,16 +2512,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test27( @@ -2534,16 +2534,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2552,16 +2552,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast 
<4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test26( @@ -2574,16 +2574,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2592,16 +2592,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <8 x i8> + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare void @llvm.x86.mmx.movnt.dq(ptr, x86_mmx) nounwind +declare void @llvm.x86.mmx.movnt.dq(ptr, <1 x i64>) nounwind define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 { ; CHECK-LABEL: define void @test25( @@ -2612,26 +2612,29 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 { ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]] ; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], x86_mmx [[MMX_VAR_I]]) #[[ATTR2]] +; CHECK: 8: +; CHECK-NEXT: tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], <1 x i64> [[MMX_VAR_I]]) #[[ATTR2]] ; CHECK-NEXT: ret void ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var.i = bitcast i64 %0 to x86_mmx - tail call void @llvm.x86.mmx.movnt.dq(ptr %p, x86_mmx %mmx_var.i) nounwind + %mmx_var.i = bitcast i64 %0 to <1 x i64> + tail call void @llvm.x86.mmx.movnt.dq(ptr %p, <1 x i64> %mmx_var.i) nounwind ret void } -declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone +declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>) nounwind readnone define i32 @test24(<1 x i64> %a) #0 { ; CHECK-LABEL: define i32 @test24( @@ -2641,26 +2644,27 @@ define i32 @test24(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP6]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP7:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[MMX_VAR_I]]) #[[ATTR2]] +; CHECK: 6: +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[MMX_VAR_I]]) #[[ATTR2]] ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP1]] ; entry: %0 = bitcast <1 x i64> %a to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %0 to x86_mmx - %1 = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %mmx_var.i) nounwind + %mmx_var.i = bitcast <8 x i8> %0 to <1 x i64> + %1 = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) nounwind ret i32 %1 } -declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, ptr) nounwind +declare void @llvm.x86.mmx.maskmovq(<1 x i64>, <1 x i64>, ptr) nounwind define void @test23(<1 x i64> %d, <1 x i64> 
%n, ptr %p) nounwind optsize ssp #0 { ; CHECK-LABEL: define void @test23( @@ -2674,33 +2678,35 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[N]] to <8 x i8> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[D]] to <8 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP5]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP9]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP8]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] -; CHECK: 9: +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 11: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: tail call void @llvm.x86.mmx.maskmovq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]] +; CHECK: 12: +; CHECK-NEXT: tail call void @llvm.x86.mmx.maskmovq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]] ; CHECK-NEXT: ret void ; entry: %0 = bitcast <1 x i64> %n to <8 x i8> %1 = bitcast <1 x i64> %d to <8 x i8> - %mmx_var.i = bitcast <8 x i8> %1 to x86_mmx - %mmx_var1.i = bitcast <8 x i8> %0 to x86_mmx - tail call void @llvm.x86.mmx.maskmovq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i, ptr %p) nounwind + %mmx_var.i = bitcast <8 x i8> %1 to <1 x i64> + %mmx_var1.i = bitcast <8 x i8> %0 to <1 x i64> + tail call void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) nounwind ret void } -declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test22( @@ -2713,16 +2719,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x 
i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2731,16 +2737,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %mmx_var.i = bitcast <4 x i16> %1 to x86_mmx - %mmx_var1.i = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to <4 x i16> + %mmx_var.i = bitcast <4 x i16> %1 to <1 x i64> + %mmx_var1.i = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8) nounwind readnone define i64 @test21(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test21( @@ -2750,16 +2756,17 @@ define i64 @test21(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK: 7: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 @@ -2767,9 +2774,9 @@ define i64 
@test21(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 @@ -2783,16 +2790,17 @@ define i32 @test21_2(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK: 7: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 @@ -2800,15 +2808,15 @@ define i32 @test21_2(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 3) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <2 x i32> %5 = extractelement <2 x i32> %4, i32 0 ret i32 %5 } -declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test20( @@ -2821,27 +2829,28 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 -; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 -; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP3:%.*]] = 
bitcast x86_mmx [[TMP2]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] +; CHECK-NEXT: [[_MSPROP:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %mmx_var.i = bitcast <2 x i32> %1 to x86_mmx - %mmx_var1.i = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %mmx_var.i, x86_mmx %mmx_var1.i) nounwind - %3 = bitcast x86_mmx %2 to i64 + %mmx_var.i = bitcast <2 x i32> %1 to <1 x i64> + %mmx_var1.i = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i) nounwind + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone +declare <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>) nounwind readnone define <2 x double> @test19(<1 x i64> %a) #0 { ; CHECK-LABEL: define <2 x double> @test19( @@ -2851,26 +2860,27 @@ define <2 x double> @test19(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP7]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP7]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx [[TMP1]]) #[[ATTR5]] +; CHECK: 7: +; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> [[TMP8]]) #[[ATTR5]] ; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[TMP2]] ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %1) nounwind readnone + %1 = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %1) nounwind readnone ret <2 x double> %2 } -declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone define i64 @test18(<2 x double> %a) #0 { ; CHECK-LABEL: define i64 @test18( @@ -2885,22 +2895,22 @@ define i64 @test18(<2 x double> %a) #0 { ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable ; CHECK: 
3: -; CHECK-NEXT: [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: - %0 = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone - %1 = bitcast x86_mmx %0 to <2 x i32> + %0 = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) nounwind readnone + %1 = bitcast <1 x i64> %0 to <2 x i32> %2 = bitcast <2 x i32> %1 to <1 x i64> %3 = extractelement <1 x i64> %2, i32 0 ret i64 %3 } -declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone +declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone define i64 @test17(<2 x double> %a) #0 { ; CHECK-LABEL: define i64 @test17( @@ -2915,22 +2925,22 @@ define i64 @test17(<2 x double> %a) #0 { ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable ; CHECK: 3: -; CHECK-NEXT: [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: - %0 = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone - %1 = bitcast x86_mmx %0 to <2 x i32> + %0 = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) nounwind readnone + %1 = bitcast <1 x i64> %0 to <2 x i32> %2 = bitcast <2 x i32> %1 to <1 x i64> %3 = extractelement <1 x i64> %2, i32 0 ret i64 %3 } -declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64>, <1 x i64>, i8) nounwind readnone define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test16( @@ -2941,34 +2951,38 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0 -; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64> ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0 -; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[_MSPROP1]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64> +; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0 +; CHECK-NEXT: 
[[TMP12:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP12]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]], i8 16) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64 +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]], i8 16) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] ; entry: %0 = extractelement <1 x i64> %a, i32 0 - %mmx_var = bitcast i64 %0 to x86_mmx + %mmx_var = bitcast i64 %0 to <1 x i64> %1 = extractelement <1 x i64> %b, i32 0 - %mmx_var1 = bitcast i64 %1 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %mmx_var, x86_mmx %mmx_var1, i8 16) - %3 = bitcast x86_mmx %2 to i64 + %mmx_var1 = bitcast i64 %1 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> %mmx_var, <1 x i64> %mmx_var1, i8 16) + %3 = bitcast <1 x i64> %2 to i64 ret i64 %3 } -declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64>) nounwind readnone define i64 @test15(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test15( @@ -2978,13 +2992,13 @@ define i64 @test15(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2992,15 +3006,15 @@ define i64 @test15(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <2 x i32> - %1 = bitcast <2 x i32> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <2 x i32> + %1 = bitcast <2 x i32> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <2 x i32> %4 = bitcast <2 x i32> %3 
to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64>) nounwind readnone define i64 @test14(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test14( @@ -3010,13 +3024,13 @@ define i64 @test14(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <4 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -3024,15 +3038,15 @@ define i64 @test14(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> - %1 = bitcast <4 x i16> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <4 x i16> + %1 = bitcast <4 x i16> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <4 x i16> %4 = bitcast <4 x i16> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64>) nounwind readnone define i64 @test13(<1 x i64> %a) #0 { ; CHECK-LABEL: define i64 @test13( @@ -3042,13 +3056,13 @@ define i64 @test13(<1 x i64> %a) #0 { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx [[TMP1]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <8 x i8> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> [[TMP1]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <8 x i8> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +; 
CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -3056,15 +3070,15 @@ define i64 @test13(<1 x i64> %a) #0 { ; entry: %0 = bitcast <1 x i64> %a to <8 x i8> - %1 = bitcast <8 x i8> %0 to x86_mmx - %2 = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1) nounwind readnone - %3 = bitcast x86_mmx %2 to <8 x i8> + %1 = bitcast <8 x i8> %0 to <1 x i64> + %2 = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> %1) nounwind readnone + %3 = bitcast <1 x i64> %2 to <8 x i8> %4 = bitcast <8 x i8> %3 to <1 x i64> %5 = extractelement <1 x i64> %4, i32 0 ret i64 %5 } -declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test12( @@ -3077,16 +3091,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3095,16 +3109,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> 
%5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test11( @@ -3117,16 +3131,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3135,16 +3149,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test10( @@ -3157,16 +3171,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to 
x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3175,16 +3189,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test9( @@ -3197,16 +3211,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3215,16 +3229,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test8( @@ -3237,16 +3251,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr 
@__msan_retval_tls, align 8 @@ -3255,16 +3269,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test7( @@ -3277,18 +3291,18 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> ; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP21]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <4 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP14]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 @@ -3299,16 +3313,16 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <8 x i8> %1 = bitcast <1 x i64> %a to <8 x i8> - %2 = bitcast <8 x i8> %1 to x86_mmx - %3 = bitcast <8 x i8> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <8 x i8> + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = bitcast <8 x i8> %0 to <1 x i64> + %4 = tail call <1 x i64> 
@llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <8 x i8> %6 = bitcast <8 x i8> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test6( @@ -3321,16 +3335,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3339,16 +3353,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test5( @@ -3361,16 +3375,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to 
<2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3379,16 +3393,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test4( @@ -3401,16 +3415,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to 
<1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3419,16 +3433,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test3( @@ -3441,16 +3455,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64> ; CHECK-NEXT: 
[[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3459,16 +3473,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <4 x i16> %1 = bitcast <1 x i64> %a to <4 x i16> - %2 = bitcast <4 x i16> %1 to x86_mmx - %3 = bitcast <4 x i16> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <4 x i16> + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = bitcast <4 x i16> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> %2, <1 x i64> %3) nounwind readnone + %5 = bitcast <1 x i64> %4 to <4 x i16> %6 = bitcast <4 x i16> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64>, <1 x i64>) nounwind readnone define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test2( @@ -3481,16 +3495,16 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]] -; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 @@ -3499,43 +3513,44 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 { entry: %0 = bitcast <1 x i64> %b to <2 x i32> %1 = bitcast <1 x i64> %a to <2 x i32> - %2 = bitcast <2 x i32> %1 to x86_mmx - %3 = bitcast <2 x i32> %0 to x86_mmx - %4 = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %2, x86_mmx %3) nounwind readnone - %5 = bitcast x86_mmx %4 to <2 x i32> + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = bitcast <2 x i32> %0 to <1 x i64> + %4 = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> %2, <1 x i64> %3) 
nounwind readnone + %5 = bitcast <1 x i64> %4 to <2 x i32> %6 = bitcast <2 x i32> %5 to <1 x i64> %7 = extractelement <1 x i64> %6, i32 0 ret i64 %7 } -define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 { +define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind #0 { ; ALL-LABEL: test89: ; ALL: # %bb.0: ; ALL-NEXT: cvtpi2ps %mm0, %xmm0 ; ALL-NEXT: ret{{[l|q]}} ; CHECK-LABEL: define <4 x float> @test89( -; CHECK-SAME: <4 x float> [[A:%.*]], x86_mmx [[B:%.*]]) #[[ATTR4:[0-9]+]] { +; CHECK-SAME: <4 x float> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP4]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], x86_mmx [[B]]) +; CHECK: 6: +; CHECK-NEXT: [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], <1 x i64> [[B]]) ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[C]] ; - %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b) + %c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, <1 x i64> %b) ret <4 x float> %c } -declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone +declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>) nounwind readnone define void @test90() #0 { ; ALL-LABEL: test90: @@ -3562,28 +3577,24 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 { ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP6]], 0 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP1:%.*]] = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx [[TMP0]], i32 [[D]], i32 2) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to <1 x i64> +; CHECK: 4: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> [[A_COERCE]], i32 [[D]], i32 2) ; CHECK-NEXT: store <1 x i64> 
zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <1 x i64> [[TMP2]] +; CHECK-NEXT: ret <1 x i64> [[TMP9]] ; entry: - %0 = bitcast <1 x i64> %a.coerce to x86_mmx - %1 = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %0, i32 %d, i32 2) - %2 = bitcast x86_mmx %1 to <1 x i64> - ret <1 x i64> %2 + %1 = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> %a.coerce, i32 %d, i32 2) + ret <1 x i64> %1 } -declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32 immarg) +declare <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64>, i32, i32 immarg) define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 { ; CHECK-LABEL: define i32 @test_mm_extract_pi16( @@ -3592,24 +3603,22 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] +; CHECK: 2: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx [[TMP0]], i32 2) +; CHECK: 3: +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> [[A_COERCE]], i32 2) ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP1]] ; entry: - %0 = bitcast <1 x i64> %a.coerce to x86_mmx - %1 = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx %0, i32 2) + %1 = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> %a.coerce, i32 2) ret i32 %1 } -declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32 immarg) +declare i32 @llvm.x86.mmx.pextr.w(<1 x i64>, i32 immarg) attributes #0 = { sanitize_memory } ;. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll index 34c3ca3706eeb5..1e412cbac84c2a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt %s -S -passes=msan 2>&1 | FileCheck %s ; new test from upstream -; XFAIL: * target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll index 93d89b40bce343..9da7f01806a7a1 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll @@ -5,9 +5,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-unknown-linux-gnu" declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone -declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) nounwind readnone declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone -declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone +declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory { entry: @@ -23,19 +23,19 @@ entry: ; CHECK: ret <4 x i32> -define x86_mmx @Test_ssse3_pmadd_ub_sw(x86_mmx %a, x86_mmx %b) sanitize_memory { +define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_memory { entry: - %c = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind - ret x86_mmx %c + %c = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind + ret <1 x i64> %c } ; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw( -; CHECK: or i64 -; CHECK: bitcast i64 {{.*}} to <4 x i16> +; CHECK: or <1 x i64> +; CHECK: bitcast <1 x i64> {{.*}} to <4 x i16> ; CHECK: icmp ne <4 x i16> {{.*}}, zeroinitializer ; CHECK: sext <4 x i1> {{.*}} to <4 x i16> -; CHECK: bitcast <4 x i16> {{.*}} to i64 -; CHECK: ret x86_mmx +; CHECK: bitcast <4 x i16> {{.*}} to <1 x i64> +; CHECK: ret <1 x i64> define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory { @@ -52,15 +52,15 @@ define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_mem ; CHECK: ret <2 x i64> -define x86_mmx @Test_x86_mmx_psad_bw(x86_mmx %a, x86_mmx %b) sanitize_memory { +define <1 x i64> @Test_x86_mmx_psad_bw(<1 x i64> %a, <1 x i64> %b) sanitize_memory { entry: - %c = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind - ret x86_mmx %c + %c = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind + ret <1 x i64> %c } ; CHECK-LABEL: @Test_x86_mmx_psad_bw( -; CHECK: or i64 +; CHECK: or <1 x i64> ; CHECK: icmp ne i64 ; CHECK: sext i1 {{.*}} to i64 ; CHECK: lshr i64 {{.*}}, 48 -; CHECK: ret x86_mmx +; CHECK: ret <1 x i64> diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll index 52acbfe0a0e779..e9202700b1df74 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll 
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+declare <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
 declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 ; Single argument vector conversion.
@@ -27,12 +27,12 @@ entry:
 ; CHECK: store i32 0, {{.*}} @__msan_retval_tls
 ; CHECK: ret i32
-; x86_mmx packed vector conversion.
+; <1 x i64> packed vector conversion.
-define x86_mmx @test_cvtps2pi(<4 x float> %value) sanitize_memory {
+define <1 x i64> @test_cvtps2pi(<4 x float> %value) sanitize_memory {
 entry:
-  %0 = tail call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %value)
-  ret x86_mmx %0
+  %0 = tail call <1 x i64> @llvm.x86.sse.cvtps2pi(<4 x float> %value)
+  ret <1 x i64> %0
 }
 ; CHECK-LABEL: @test_cvtps2pi
@@ -42,9 +42,9 @@ entry:
 ; CHECK: icmp ne {{.*}}[[S]], 0
 ; CHECK: br
 ; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
-; CHECK: store i64 0, {{.*}} @__msan_retval_tls
-; CHECK: ret x86_mmx
+; CHECK: call <1 x i64> @llvm.x86.sse.cvtps2pi
+; CHECK: store <1 x i64> zeroinitializer, {{.*}} @__msan_retval_tls
+; CHECK: ret <1 x i64>
 ; avx512 rounding conversion.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
index 775c29791aefed..0f6f1fe4a7dcad 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
 declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b) nounwind readnone
-declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+declare <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64>, <1 x i64>) nounwind readnone
 define <8 x i16> @Test_packssdw_128(<4 x i32> %a, <4 x i32> %b) sanitize_memory {
 entry:
@@ -40,22 +40,21 @@ entry:
 ; CHECK: ret <32 x i8>
-define x86_mmx @Test_mmx_packuswb(x86_mmx %a, x86_mmx %b) sanitize_memory {
+define <1 x i64> @Test_mmx_packuswb(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
 entry:
-  %c = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind
-  ret x86_mmx %c
+  %c = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> %a, <1 x i64> %b) nounwind
+  ret <1 x i64> %c
 }
 ; CHECK-LABEL: @Test_mmx_packuswb(
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
 ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
 ; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
 ; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packsswb({{.*}}
-; CHECK-DAG: bitcast x86_mmx {{.*}} to i64
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packuswb({{.*}}
-; CHECK: ret x86_mmx
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packsswb({{.*}}
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packuswb({{.*}}
+; CHECK: ret <1 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
index 7514e6ea74bb46..461d6cb9217d8f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
@@ -6,7 +6,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
-declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx)
+declare <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64>, <1 x i64>)
 declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>)
 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>)
 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>)
@@ -18,10 +18,10 @@ declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32)
 define i64 @test_mmx(i64 %x.coerce, i64 %y.coerce) sanitize_memory {
 entry:
   %0 = bitcast i64 %x.coerce to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = bitcast i64 %y.coerce to x86_mmx
-  %3 = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %1, x86_mmx %2)
-  %4 = bitcast x86_mmx %3 to <2 x i32>
+  %1 = bitcast <2 x i32> %0 to <1 x i64>
+  %2 = bitcast i64 %y.coerce to <1 x i64>
+  %3 = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> %1, <1 x i64> %2)
+  %4 = bitcast <1 x i64> %3 to <2 x i32>
   %5 = bitcast <2 x i32> %4 to <1 x i64>
   %6 = extractelement <1 x i64> %5, i32 0
   ret i64 %6
@@ -29,11 +29,11 @@ entry:
 ; CHECK-LABEL: @test_mmx
 ; CHECK: = icmp ne i64 {{.*}}, 0
-; CHECK: [[C:%.*]] = sext i1 {{.*}} to i64
-; CHECK: [[A:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(
-; CHECK: [[B:%.*]] = bitcast x86_mmx {{.*}}[[A]] to i64
-; CHECK: = or i64 {{.*}}[[B]], {{.*}}[[C]]
-; CHECK: call x86_mmx @llvm.x86.mmx.psll.d(
+; CHECK: [[B:%.*]] = sext i1 {{.*}} to i64
+; CHECK: [[C:%.*]] = bitcast i64 [[B]] to <1 x i64>
+; CHECK: [[A:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(
+; CHECK: = or <1 x i64> {{.*}}[[A]], {{.*}}[[C]]
+; CHECK: call <1 x i64> @llvm.x86.mmx.psll.d(
 ; CHECK: ret i64
diff --git a/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll b/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
index 990e2810d851e4..2e9da9f2b5b779 100644
--- a/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
+++ b/llvm/test/MC/X86/x86-GCC-inline-asm-Y-constraints.ll
@@ -11,7 +11,7 @@ define void @f_Ym(i64 %m.coerce) {
 ; CHECK: ## InlineAsm End
 entry:
-  %0 = tail call x86_mmx asm sideeffect "movq $0, %mm1\0A\09", "=^Ym,~{dirflag},~{fpsr},~{flags}"()
+  %0 = tail call <1 x i64> asm sideeffect "movq $0, %mm1\0A\09", "=^Ym,~{dirflag},~{fpsr},~{flags}"()
   ret void
 }
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
index 63114288fc5810..76e2f9af2ed1de 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -7,12 +7,12 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; DemandedBits - MOVMSK zeros the upper bits of the result.
 ;
-define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
+define i32 @test_upper_x86_mmx_pmovmskb(<1 x i64> %a0) {
 ; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[A0:%.*]])
 ; CHECK-NEXT: ret i32 [[TMP1]]
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %a0)
   %2 = and i32 %1, 255
   ret i32 %2
 }
@@ -87,11 +87,11 @@ define i32 @test_upper_x86_avx_movmsk_pd_256(<4 x double> %a0) {
 ; DemandedBits - If we don't use the lower bits then we just return zero.
 ;
-define i32 @test_lower_x86_mmx_pmovmskb(x86_mmx %a0) {
+define i32 @test_lower_x86_mmx_pmovmskb(<1 x i64> %a0) {
 ; CHECK-LABEL: @test_lower_x86_mmx_pmovmskb(
 ; CHECK-NEXT: ret i32 0
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %a0)
   %2 = and i32 %1, -256
   ret i32 %2
 }
@@ -151,7 +151,7 @@ define i32 @undef_x86_mmx_pmovmskb() {
 ; CHECK-LABEL: @undef_x86_mmx_pmovmskb(
 ; CHECK-NEXT: ret i32 0
 ;
-  %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx undef)
+  %1 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> undef)
   ret i32 %1
 }
@@ -207,16 +207,6 @@ define i32 @undef_x86_avx2_pmovmskb() {
 ; Constant Folding (ZERO -> ZERO)
 ;
-define i32 @zero_x86_mmx_pmovmskb() {
-; CHECK-LABEL: @zero_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx))
-; CHECK-NEXT: ret i32 [[TMP1]]
-;
-  %1 = bitcast <1 x i64> zeroinitializer to x86_mmx
-  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
-  ret i32 %2
-}
-
 define i32 @zero_x86_sse_movmsk_ps() {
 ; CHECK-LABEL: @zero_x86_sse_movmsk_ps(
 ; CHECK-NEXT: ret i32 0
@@ -271,11 +261,11 @@ define i32 @zero_x86_avx2_pmovmskb() {
 define i32 @fold_x86_mmx_pmovmskb() {
 ; CHECK-LABEL: @fold_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> to x86_mmx))
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> splat (i64 18084223940296448))
 ; CHECK-NEXT: ret i32 [[TMP1]]
 ;
-  %1 = bitcast <8 x i8> to x86_mmx
-  %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
+  %1 = bitcast <8 x i8> to <1 x i64>
+  %2 = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %1)
   ret i32 %2
 }
@@ -447,7 +437,7 @@ define i32 @sext_sse_movmsk_ps_must_replicate_bits(<2 x i1> %x) {
   ret i32 %r
 }
-declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx)
+declare i32 @llvm.x86.mmx.pmovmskb(<1 x i64>)
 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)
 declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>)
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
index 38a7391a1a1e37..d4ec9e3aae6795 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
@@ -38,38 +38,6 @@ define <1 x i64> @d(i64 %y) {
   ret <1 x i64> %c
 }
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT: [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: ret x86_mmx [[C]]
-;
-  %c = bitcast <1 x i64> %y to x86_mmx
-  ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT: [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT: ret <1 x i64> [[C]]
-;
-  %c = bitcast x86_mmx %y to <1 x i64>
-  ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT: ret double [[TMP0]]
-;
-entry:
-  %0 = bitcast x86_mmx %x to <1 x i64>
-  %1 = bitcast <1 x i64> %0 to double
-  ret double %1
-}
-
 ; FP source is ok.
 define <3 x i64> @bitcast_inselt_undef(double %x, i32 %idx) {
@@ -137,19 +105,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
   ret <3 x i64> %i
 }
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT: [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT: [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT: ret <3 x i64> [[I]]
-;
-  %xb = bitcast x86_mmx %x to i64
-  %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
-  ret <3 x i64> %i
-}
-
 ; Reduce number of casts
 define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
index 8b8325b1472637..f787b3c4cc9ac2 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
@@ -38,37 +38,6 @@ define <1 x i64> @d(i64 %y) {
   ret <1 x i64> %c
 }
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT: [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: ret x86_mmx [[C]]
-;
-  %c = bitcast <1 x i64> %y to x86_mmx
-  ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT: [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT: ret <1 x i64> [[C]]
-;
-  %c = bitcast x86_mmx %y to <1 x i64>
-  ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT: ret double [[TMP0]]
-;
-entry:
-  %0 = bitcast x86_mmx %x to <1 x i64>
-  %1 = bitcast <1 x i64> %0 to double
-  ret double %1
-}
 ; FP source is ok.
@@ -137,19 +106,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
   ret <3 x i64> %i
 }
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT: [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT: [[I:%.*]] = insertelement <3 x i64> undef, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT: ret <3 x i64> [[I]]
-;
-  %xb = bitcast x86_mmx %x to i64
-  %i = insertelement <3 x i64> undef, i64 %xb, i32 %idx
-  ret <3 x i64> %i
-}
-
 ; Reduce number of casts
 define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll
index 1534d5a4b3aaeb..ca748a9483e9b2 100644
--- a/llvm/test/Transforms/InstCombine/cast.ll
+++ b/llvm/test/Transforms/InstCombine/cast.ll
@@ -937,27 +937,6 @@ define float @test2c() {
   ret float extractelement (<2 x float> bitcast (double bitcast (<2 x float> to double) to <2 x float>), i32 0)
 }
-define i64 @test_mmx(<2 x i32> %x) {
-; ALL-LABEL: @test_mmx(
-; ALL-NEXT: [[C:%.*]] = bitcast <2 x i32> [[X:%.*]] to i64
-; ALL-NEXT: ret i64 [[C]]
-;
-  %A = bitcast <2 x i32> %x to x86_mmx
-  %B = bitcast x86_mmx %A to <2 x i32>
-  %C = bitcast <2 x i32> %B to i64
-  ret i64 %C
-}
-
-define i64 @test_mmx_const(<2 x i32> %c) {
-; ALL-LABEL: @test_mmx_const(
-; ALL-NEXT: ret i64 0
-;
-  %A = bitcast <2 x i32> zeroinitializer to x86_mmx
-  %B = bitcast x86_mmx %A to <2 x i32>
-  %C = bitcast <2 x i32> %B to i64
-  ret i64 %C
-}
-
 ; PR12514
 define i1 @test67(i1 %a, i32 %b) {
 ; ALL-LABEL: @test67(
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
index bce07b07756209..c383ff7a90ded2 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep-zeroinit-vector.ll
@@ -12,18 +12,5 @@ define <2 x ptr> @test_gep() {
 ; CHECK-NEXT: ret <2 x ptr>
 ;
   %A = getelementptr [1 x %rec8], ptr @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer
-  %B = bitcast <2 x ptr> %A to <2 x ptr>
-  ret <2 x ptr> %B
-}
-
-; Testcase that verify the cast-of-cast when the outer/second cast is to a
-; vector type.
-
-define <4 x i16> @test_mmx_const() {
-; CHECK-LABEL: @test_mmx_const(
-; CHECK-NEXT: ret <4 x i16> zeroinitializer
-;
-  %A = bitcast <2 x i32> zeroinitializer to x86_mmx
-  %B = bitcast x86_mmx %A to <4 x i16>
-  ret <4 x i16> %B
+  ret <2 x ptr> %A
 }
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
index d4c49faf91b091..dd75560e25ceda 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
@@ -335,19 +335,6 @@ define { i64, i64 } @test_load_struct() {
   ret { i64, i64 } %v
 }
-@m64 = internal constant [2 x i64] zeroinitializer
-@idx = external global i32
-
-; This should not try to create an x86_mmx null value.
-define x86_mmx @load_mmx() {
-; CHECK-LABEL: @load_mmx(
-; CHECK-NEXT: [[TEMP:%.*]] = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64)), align 8
-; CHECK-NEXT: ret x86_mmx [[TEMP]]
-;
-  %temp = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64))
-  ret x86_mmx %temp
-}
-
 @g_offset = external global i64
 @g_neg_one_vec = constant <4 x i8>
diff --git a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll b/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
deleted file mode 100644
index b460b79d0640aa..00000000000000
--- a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define x86_mmx @f() #0 {
-; CHECK-LABEL: define x86_mmx @f
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[PHI]], 7
-; CHECK-NEXT: [[ADD_7]] = add i32 [[PHI]], 8
-; CHECK-NEXT: [[CMP_7:%.*]] = icmp eq i32 [[ADD_6]], 0
-; CHECK-NEXT: br i1 [[CMP_7]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: [[RET:%.*]] = phi x86_mmx [ undef, [[FOR_BODY]] ]
-; CHECK-NEXT: ret x86_mmx [[RET]]
-;
-entry:
-  br label %for.body
-
-for.body: ; preds = %for.body, %entry
-  %phi = phi i32 [ 1, %entry ], [ %add, %for.body ]
-  %add = add i32 %phi, 1
-  %cmp = icmp eq i32 %phi, 0
-  br i1 %cmp, label %exit, label %for.body
-
-exit: ; preds = %for.body
-  %ret = phi x86_mmx [ undef, %for.body ]
-  ret x86_mmx %ret
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
diff --git a/llvm/test/Transforms/SCCP/crash.ll b/llvm/test/Transforms/SCCP/crash.ll
index 8f8ad44db437b0..47d9329f6f03da 100644
--- a/llvm/test/Transforms/SCCP/crash.ll
+++ b/llvm/test/Transforms/SCCP/crash.ll
@@ -28,7 +28,7 @@ define i32 @test2([4 x i32] %A) {
   ret i32 %B
 }
-define x86_mmx @test3() {
-  %load = load x86_mmx, ptr null
-  ret x86_mmx %load
+define <1 x i64> @test3() {
+  %load = load <1 x i64>, ptr null
+  ret <1 x i64> %load
 }
diff --git a/llvm/test/Transforms/SROA/pr57796.ll b/llvm/test/Transforms/SROA/pr57796.ll
index 1bf1ad7ee934a5..4eb6a7107dad30 100644
--- a/llvm/test/Transforms/SROA/pr57796.ll
+++ b/llvm/test/Transforms/SROA/pr57796.ll
@@ -17,9 +17,9 @@ define void @foo() {
 ; CHECK-NEXT: [[CALL_I:%.*]] = call align 32 ptr @value_set_type(ptr align 32 [[REF_TMP_I]])
 ; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[CALL_I]], align 32
 ; CHECK-NEXT: [[REF_TMP_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 0)
-; CHECK-NEXT: store x86_mmx [[TMP2]], ptr @A, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP1]], i8 0)
+; CHECK-NEXT: store <1 x i64> [[TMP2]], ptr @A, align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -29,13 +29,13 @@ entry:
   %call.i = call align 32 ptr @value_set_type(ptr align 32 %ref.tmp.i)
   %0 = load <32 x i8>, ptr %call.i, align 32
   store <32 x i8> %0, ptr %ref.tmp, align 32
-  %1 = load x86_mmx, ptr %ref.tmp, align 32
-  %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 0)
-  store x86_mmx %2, ptr @A, align 8
+  %1 = load <1 x i64>, ptr %ref.tmp, align 32
+  %2 = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %1, i8 0)
+  store <1 x i64> %2, ptr @A, align 8
   ret void
 }
-declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8 immarg)
+declare <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64>, i8 immarg)
 declare dso_local void @value_create(ptr sret(%struct.Value) align 32)
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index fe70ba082cb4c1..f835b98b243456 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -3,12 +3,12 @@
 ; CHECK: atomic store operand must have integer, pointer, or floating point type!
 ; CHECK: atomic load operand must have integer, pointer, or floating point type!
-define void @foo(ptr %P, x86_mmx %v) {
-  store atomic x86_mmx %v, ptr %P unordered, align 8
+define void @foo(ptr %P, <1 x i64> %v) {
+  store atomic <1 x i64> %v, ptr %P unordered, align 8
   ret void
 }
-define x86_mmx @bar(ptr %P) {
-  %v = load atomic x86_mmx, ptr %P unordered, align 8
-  ret x86_mmx %v
+define <1 x i64> @bar(ptr %P) {
+  %v = load atomic <1 x i64>, ptr %P unordered, align 8
+  ret <1 x i64> %v
 }
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index 09d4421ea8a407..4173e49e60a046 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -153,8 +153,6 @@ struct TypeCloner {
       return LLVMMetadataTypeInContext(Ctx);
     case LLVMX86_AMXTypeKind:
       return LLVMX86AMXTypeInContext(Ctx);
-    case LLVMX86_MMXTypeKind:
-      return LLVMX86MMXTypeInContext(Ctx);
    case LLVMTokenTypeKind:
      return LLVMTokenTypeInContext(Ctx);
    case LLVMTargetExtTypeKind: {
diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp
index 277938a611fd0e..e44b6023fff231 100644
--- a/llvm/tools/llvm-stress/llvm-stress.cpp
+++ b/llvm/tools/llvm-stress/llvm-stress.cpp
@@ -173,8 +173,6 @@ struct Modifier {
       Ty = Type::getX86_FP80Ty(Context);
     else if (Arg == "ppc_fp128")
       Ty = Type::getPPC_FP128Ty(Context);
-    else if (Arg == "x86_mmx")
-      Ty = Type::getX86_MMXTy(Context);
    else if (Arg.starts_with("i")) {
      unsigned N = 0;
      Arg.drop_front().getAsInteger(10, N);
@@ -294,11 +292,7 @@ struct Modifier {
   /// Pick a random vector type.
   Type *pickVectorType(VectorType *VTy = nullptr) {
-    // Vectors of x86mmx are illegal; keep trying till we get something else.
-    Type *Ty;
-    do {
-      Ty = pickScalarType();
-    } while (Ty->isX86_MMXTy());
+    Type *Ty = pickScalarType();
     if (VTy)
       return VectorType::get(Ty, VTy->getElementCount());
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index f517761efa3c0b..b730d665c78132 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -205,7 +205,6 @@ TEST(InstructionsTest, CastInst) {
   Type *Int64Ty = Type::getInt64Ty(C);
   Type *V8x8Ty = FixedVectorType::get(Int8Ty, 8);
   Type *V8x64Ty = FixedVectorType::get(Int64Ty, 8);
-  Type *X86MMXTy = Type::getX86_MMXTy(C);
   Type *HalfTy = Type::getHalfTy(C);
   Type *FloatTy = Type::getFloatTy(C);
@@ -248,9 +247,6 @@ TEST(InstructionsTest, CastInst) {
   EXPECT_EQ(CastInst::Trunc, CastInst::getCastOpcode(c64, true, V8x8Ty, true));
   EXPECT_EQ(CastInst::SExt, CastInst::getCastOpcode(c8, true, V8x64Ty, true));
-  EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, X86MMXTy));
-  EXPECT_FALSE(CastInst::isBitCastable(X86MMXTy, V8x8Ty));
-  EXPECT_FALSE(CastInst::isBitCastable(Int64Ty, X86MMXTy));
   EXPECT_FALSE(CastInst::isBitCastable(V8x64Ty, V8x8Ty));
   EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, V8x64Ty));
@@ -1809,7 +1805,7 @@ TEST(InstructionsTest, AllocaInst) {
   %A = alloca i32, i32 1
   %B = alloca i32, i32 4
   %C = alloca i32, i32 %n
-  %D = alloca <8 x double>
+  %D = alloca double
   %E = alloca
   %F = alloca [2 x half]
   %G = alloca [2 x [3 x i128]]
@@ -1835,7 +1831,8 @@ TEST(InstructionsTest, AllocaInst) {
   EXPECT_EQ(A.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
   EXPECT_EQ(B.getAllocationSizeInBits(DL), TypeSize::getFixed(128));
   EXPECT_FALSE(C.getAllocationSizeInBits(DL));
-  EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(512));
+  EXPECT_EQ(DL.getTypeSizeInBits(D.getAllocatedType()), TypeSize::getFixed(64));
+  EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(64));
   EXPECT_EQ(E.getAllocationSizeInBits(DL), TypeSize::getScalable(512));
   EXPECT_EQ(F.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
   EXPECT_EQ(G.getAllocationSizeInBits(DL), TypeSize::getFixed(768));
diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md
index bc0f484108facf..fadc81b567b4e4 100644
--- a/mlir/docs/Dialects/LLVM.md
+++ b/mlir/docs/Dialects/LLVM.md
@@ -240,8 +240,6 @@ dialect as there is no corresponding built-in type.
 The following non-parametric types derived from the LLVM IR are available in the LLVM dialect:
-- `!llvm.x86_mmx` (`LLVMX86MMXType`) - value held in an MMX register on x86
-  machine.
 - `!llvm.ppc_fp128` (`LLVMPPCFP128Type`) - 128-bit floating-point value (two 64 bits).
 - `!llvm.token` (`LLVMTokenType`) - a non-inspectable value associated with an
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
index f3d211b4aebb17..2ea589a7c4c3bd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
@@ -67,7 +67,6 @@ namespace LLVM {
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMVoidType, "llvm.void");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMPPCFP128Type, "llvm.ppc_fp128");
-DEFINE_TRIVIAL_LLVM_TYPE(LLVMX86MMXType, "llvm.x86_mmx");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMTokenType, "llvm.token");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMLabelType, "llvm.label");
 DEFINE_TRIVIAL_LLVM_TYPE(LLVMMetadataType, "llvm.metadata");
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 929918e5c3e76f..6b2d8943bf4885 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -3527,7 +3527,6 @@ void LLVMDialect::initialize() {
   // clang-format off
   addTypes(type)
       .Case([&](Type) { return "void"; })
       .Case([&](Type) { return "ppc_fp128"; })
-      .Case([&](Type) { return "x86_mmx"; })
      .Case([&](Type) { return "token"; })
      .Case([&](Type) { return "label"; })
      .Case([&](Type) { return "metadata"; })
@@ -310,7 +309,6 @@ static Type dispatchParse(AsmParser &parser, bool allowAny = true) {
   return StringSwitch>(key)
       .Case("void", [&] { return LLVMVoidType::get(ctx); })
       .Case("ppc_fp128", [&] { return LLVMPPCFP128Type::get(ctx); })
-      .Case("x86_mmx", [&] { return LLVMX86MMXType::get(ctx); })
      .Case("token", [&] { return LLVMTokenType::get(ctx); })
      .Case("label", [&] { return LLVMLabelType::get(ctx); })
      .Case("metadata", [&] { return LLVMMetadataType::get(ctx); })
diff --git a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
index c00214ede206dd..ea990ca7aefbe0 100644
--- a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
@@ -67,8 +67,6 @@ class TypeFromLLVMIRTranslatorImpl {
       return LLVM::LLVMX86AMXType::get(&context);
     if (type->isPPC_FP128Ty())
       return LLVM::LLVMPPCFP128Type::get(&context);
-    if (type->isX86_MMXTy())
-      return LLVM::LLVMX86MMXType::get(&context);
    if (type->isLabelTy())
      return LLVM::LLVMLabelType::get(&context);
    if (type->isMetadataTy())
diff --git a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
index be200acac28e7e..c7a533eddce84b 100644
--- a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
@@ -58,9 +58,6 @@ class TypeToLLVMIRTranslatorImpl {
           .Case([this](LLVM::LLVMPPCFP128Type) {
             return llvm::Type::getPPC_FP128Ty(context);
           })
-          .Case([this](LLVM::LLVMX86MMXType) {
-            return llvm::Type::getX86_MMXTy(context);
-          })
          .Case([this](LLVM::LLVMTokenType) {
            return llvm::Type::getTokenTy(context);
          })
diff --git a/offload/hostexec/services/execute_service.cpp b/offload/hostexec/services/execute_service.cpp
index 489f57e737a975..2e0e6741b456e8 100644
--- a/offload/hostexec/services/execute_service.cpp
+++ b/offload/hostexec/services/execute_service.cpp
@@ -417,7 +417,6 @@ enum TypeID {
   VoidTyID, ///< type with no size
   LabelTyID, ///< Labels
   MetadataTyID, ///< Metadata
-  X86_MMXTyID, ///< MMX vectors (64 bits, X86 specific)
   X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific)
   TokenTyID, ///< Tokens
@@ -528,7 +527,6 @@ static service_rc hostrpc_pfBuildValist(hostrpc_ValistExt_t *valist,
   case StructTyID: ///< 13: Structures
   case FunctionTyID: ///< 12: Functions
   case TokenTyID: ///< 10: Tokens
-  case X86_MMXTyID: ///< 9: MMX vectors (64 bits, X86 specific)
   case MetadataTyID: ///< 8: Metadata
   case LabelTyID: ///< 7: Labels
   case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
@@ -766,7 +764,6 @@ static service_rc hostrpc_build_vargs_array(int NumArgs, char *keyptr,
   case StructTyID: ///< 13: Structures
   case FunctionTyID: ///< 12: Functions
   case TokenTyID: ///< 10: Tokens
-  case X86_MMXTyID: ///< 9: MMX vectors (64 bits, X86 specific)
   case MetadataTyID: ///< 8: Metadata
   case LabelTyID: ///< 7: Labels
   case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
diff --git a/revert_patches.txt b/revert_patches.txt
index 85a42628e187c8..710f0990bf25fa 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -1,9 +1,4 @@
 ---
-Revert: breaks devIO for openmp
-dfeb3991fb48 Remove the `x86_mmx` IR type. (#98505)
-b7e4fba6e5dc Cleanup x86_mmx after removing IR type (#100646) (Reason: dependent on dfeb3991fb48)
-Ron: still broken 11-1-24
----
 Revert: breaks build of hipCUB
 commit 55783bd0
 [HIP] fix host min/max in header (#82956)
 Sam