From c588f51f481565f615880c8a9736b4fe0abda18f Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Wed, 29 Jan 2025 12:23:15 +0100
Subject: [PATCH] x86-sse2 ABI: use SSE registers for floats and SIMD

---
 compiler/rustc_target/src/callconv/mod.rs     | 118 +++++++++++-------
 compiler/rustc_target/src/callconv/x86.rs     |  14 ++-
 .../closure-inherit-target-feature.rs         |   3 +-
 tests/assembly/x86-return-float.rs            | 111 ++++++++--------
 tests/codegen/abi-x86-sse.rs                  |  36 ++++++
 tests/codegen/intrinsics/transmute-x64.rs     |   7 +-
 .../simd-intrinsic-transmute-array.rs         |   4 +-
 tests/codegen/simd/packed-simd.rs             |  25 ++--
 ...e-abi-checks.rs => sse-simd-abi-checks.rs} |   5 +-
 ...ecks.stderr => sse-simd-abi-checks.stderr} |   4 +-
 10 files changed, 205 insertions(+), 122 deletions(-)
 create mode 100644 tests/codegen/abi-x86-sse.rs
 rename tests/ui/{sse-abi-checks.rs => sse-simd-abi-checks.rs} (82%)
 rename tests/ui/{sse-abi-checks.stderr => sse-simd-abi-checks.stderr} (94%)

diff --git a/compiler/rustc_target/src/callconv/mod.rs b/compiler/rustc_target/src/callconv/mod.rs
index 9e651376cd7ce..0707db6674b4a 100644
--- a/compiler/rustc_target/src/callconv/mod.rs
+++ b/compiler/rustc_target/src/callconv/mod.rs
@@ -9,7 +9,7 @@ pub use rustc_abi::{Primitive, Reg, RegKind};
 use rustc_macros::HashStable_Generic;
 use rustc_span::Symbol;
 
-use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, WasmCAbi};
+use crate::spec::{HasTargetSpec, HasWasmCAbiOpt, HasX86AbiOpt, RustcAbi, WasmCAbi};
 
 mod aarch64;
 mod amdgpu;
@@ -388,6 +388,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {
     /// Pass this argument directly instead. Should NOT be used!
     /// Only exists because of past ABI mistakes that will take time to fix
     /// (see <https://github.com/rust-lang/rust/issues/115666>).
+    #[track_caller]
     pub fn make_direct_deprecated(&mut self) {
         match self.mode {
             PassMode::Indirect { .. } => {
@@ -400,6 +401,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {
 
     /// Pass this argument indirectly, by passing a (thin or wide) pointer to the argument instead.
     /// This is valid for both sized and unsized arguments.
+    #[track_caller]
     pub fn make_indirect(&mut self) {
         match self.mode {
             PassMode::Direct(_) | PassMode::Pair(_, _) => {
@@ -414,6 +416,7 @@ impl<'a, Ty> ArgAbi<'a, Ty> {
 
     /// Same as `make_indirect`, but for arguments that are ignored. Only needed for ABIs that pass
     /// ZSTs indirectly.
+    #[track_caller]
     pub fn make_indirect_from_ignore(&mut self) {
         match self.mode {
             PassMode::Ignore => {
@@ -737,7 +740,7 @@ impl<'a, Ty> FnAbi<'a, Ty> {
         C: HasDataLayout + HasTargetSpec,
     {
         let spec = cx.target_spec();
-        match &spec.arch[..] {
+        match &*spec.arch {
             "x86" => x86::compute_rust_abi_info(cx, self, abi),
             "riscv32" | "riscv64" => riscv::compute_rust_abi_info(cx, self, abi),
             "loongarch64" => loongarch::compute_rust_abi_info(cx, self, abi),
@@ -745,6 +748,22 @@ impl<'a, Ty> FnAbi<'a, Ty> {
             _ => {}
         };
 
+        // Decides whether we can pass the given SIMD argument via `PassMode::Direct`.
+        // May only return `true` if the target will always pass those arguments the same way,
+        // no matter what the user does with `-Ctarget-feature`! In other words, whatever
+        // target features are required to pass a SIMD value in registers must be listed in
+        // the `abi_required_features` for the current target and ABI.
+        let can_pass_simd_directly = |arg: &ArgAbi<'_, Ty>| match &*spec.arch {
+            // On x86, if we have SSE2 (which we have by default for x86_64), we can always pass up
+            // to 128-bit-sized vectors.
+            "x86" if spec.rustc_abi == Some(RustcAbi::X86Sse2) => arg.layout.size.bits() <= 128,
+            "x86_64" if spec.rustc_abi != Some(RustcAbi::X86Softfloat) => {
+                arg.layout.size.bits() <= 128
+            }
+            // So far, we haven't implemented this logic for any other target.
+            _ => false,
+        };
+
         for (arg_idx, arg) in self
             .args
             .iter_mut()
@@ -752,12 +771,15 @@ impl<'a, Ty> FnAbi<'a, Ty> {
             .map(|(idx, arg)| (Some(idx), arg))
             .chain(iter::once((None, &mut self.ret)))
         {
-            if arg.is_ignore() {
+            // If the logic above already picked a specific type to cast the argument to, leave that
+            // in place.
+            if matches!(arg.mode, PassMode::Ignore | PassMode::Cast { .. }) {
                 continue;
             }
 
             if arg_idx.is_none()
                 && arg.layout.size > Primitive::Pointer(AddressSpace::DATA).size(cx) * 2
+                && !matches!(arg.layout.backend_repr, BackendRepr::Vector { .. })
             {
                 // Return values larger than 2 registers using a return area
                 // pointer. LLVM and Cranelift disagree about how to return
@@ -767,7 +789,8 @@ impl<'a, Ty> FnAbi<'a, Ty> {
                 // return value independently and decide to pass it in a
                 // register or not, which would result in the return value
                 // being passed partially in registers and partially through a
-                // return area pointer.
+                // return area pointer. For large IR-level values such as `i128`,
+                // Cranelift will even split up the value into smaller chunks.
                 //
                 // While Cranelift may need to be fixed as the LLVM behavior is
                 // generally more correct with respect to the surface language,
@@ -797,53 +820,60 @@ impl<'a, Ty> FnAbi<'a, Ty> {
                 // rustc_target already ensure any return value which doesn't
                 // fit in the available amount of return registers is passed in
                 // the right way for the current target.
+                //
+                // The adjustment is neither necessary nor desired for types with a vector
+                // representation; those are handled below.
                 arg.make_indirect();
                 continue;
             }
 
             match arg.layout.backend_repr {
-                BackendRepr::Memory { .. } => {}
-
-                // This is a fun case! The gist of what this is doing is
-                // that we want callers and callees to always agree on the
-                // ABI of how they pass SIMD arguments. If we were to *not*
-                // make these arguments indirect then they'd be immediates
-                // in LLVM, which means that they'd used whatever the
-                // appropriate ABI is for the callee and the caller. That
-                // means, for example, if the caller doesn't have AVX
-                // enabled but the callee does, then passing an AVX argument
-                // across this boundary would cause corrupt data to show up.
-                //
-                // This problem is fixed by unconditionally passing SIMD
-                // arguments through memory between callers and callees
-                // which should get them all to agree on ABI regardless of
-                // target feature sets. Some more information about this
-                // issue can be found in #44367.
-                //
-                // Note that the intrinsic ABI is exempt here as
-                // that's how we connect up to LLVM and it's unstable
-                // anyway, we control all calls to it in libstd.
-                BackendRepr::Vector { .. }
-                    if abi != ExternAbi::RustIntrinsic && spec.simd_types_indirect =>
-                {
-                    arg.make_indirect();
-                    continue;
+                BackendRepr::Memory { .. } => {
+                    // Compute `Aggregate` ABI.
+
+                    let is_indirect_not_on_stack =
+                        matches!(arg.mode, PassMode::Indirect { on_stack: false, .. });
+                    assert!(is_indirect_not_on_stack);
+
+                    let size = arg.layout.size;
+                    if arg.layout.is_sized()
+                        && size <= Primitive::Pointer(AddressSpace::DATA).size(cx)
+                    {
+                        // We want to pass small aggregates as immediates, but using
+                        // an LLVM aggregate type for this leads to bad optimizations,
+                        // so we pick an appropriately sized integer type instead.
+                        arg.cast_to(Reg { kind: RegKind::Integer, size });
+                    }
                 }
 
-                _ => continue,
-            }
-            // Compute `Aggregate` ABI.
-
-            let is_indirect_not_on_stack =
-                matches!(arg.mode, PassMode::Indirect { on_stack: false, .. });
-            assert!(is_indirect_not_on_stack);
-
-            let size = arg.layout.size;
-            if !arg.layout.is_unsized() && size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
-                // We want to pass small aggregates as immediates, but using
-                // an LLVM aggregate type for this leads to bad optimizations,
-                // so we pick an appropriately sized integer type instead.
-                arg.cast_to(Reg { kind: RegKind::Integer, size });
+                BackendRepr::Vector { .. } => {
+                    // This is a fun case! The gist of what this is doing is
+                    // that we want callers and callees to always agree on the
+                    // ABI of how they pass SIMD arguments. If we were to *not*
+                    // make these arguments indirect then they'd be immediates
+                    // in LLVM, which means that they'd use whatever the
+                    // appropriate ABI is for the callee and the caller. That
+                    // means, for example, if the caller doesn't have AVX
+                    // enabled but the callee does, then passing an AVX argument
+                    // across this boundary would cause corrupt data to show up.
+                    //
+                    // This problem is fixed by unconditionally passing SIMD
+                    // arguments through memory between callers and callees
+                    // which should get them all to agree on ABI regardless of
+                    // target feature sets. Some more information about this
+                    // issue can be found in #44367.
+                    //
+                    // Note that the intrinsic ABI is exempt here as intrinsics are not
+                    // real functions anyway, and the backend expects very specific types.
+                    if abi != ExternAbi::RustIntrinsic
+                        && spec.simd_types_indirect
+                        && !can_pass_simd_directly(arg)
+                    {
+                        arg.make_indirect();
+                    }
+                }
+
+                _ => {}
             }
         }
     }
diff --git a/compiler/rustc_target/src/callconv/x86.rs b/compiler/rustc_target/src/callconv/x86.rs
index 7c88d9b55cfee..ce368199873c5 100644
--- a/compiler/rustc_target/src/callconv/x86.rs
+++ b/compiler/rustc_target/src/callconv/x86.rs
@@ -4,7 +4,7 @@ use rustc_abi::{
 };
 
 use crate::abi::call::{ArgAttribute, FnAbi, PassMode};
-use crate::spec::HasTargetSpec;
+use crate::spec::{HasTargetSpec, RustcAbi};
 
 #[derive(PartialEq)]
 pub(crate) enum Flavor {
@@ -236,8 +236,16 @@ where
             _ => false, // anyway not passed via registers on x86
         };
         if has_float {
-            if fn_abi.ret.layout.size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
-                // Same size or smaller than pointer, return in a register.
+            if cx.target_spec().rustc_abi == Some(RustcAbi::X86Sse2)
+                && fn_abi.ret.layout.backend_repr.is_scalar()
+                && fn_abi.ret.layout.size.bits() <= 128
+            {
+                // This is a single scalar that fits into an SSE register, and the target uses the
+                // SSE ABI. We prefer this over integer registers as float scalars need to be in SSE
+                // registers for float operations, so that's the best place to pass them around.
+                fn_abi.ret.cast_to(Reg { kind: RegKind::Vector, size: fn_abi.ret.layout.size });
+            } else if fn_abi.ret.layout.size <= Primitive::Pointer(AddressSpace::DATA).size(cx) {
+                // Same size or smaller than pointer, return in an integer register.
                 fn_abi.ret.cast_to(Reg { kind: RegKind::Integer, size: fn_abi.ret.layout.size });
             } else {
                 // Larger than a pointer, return indirectly.
diff --git a/tests/assembly/closure-inherit-target-feature.rs b/tests/assembly/closure-inherit-target-feature.rs
index 4692653d91fc6..9f73820f0ea1f 100644
--- a/tests/assembly/closure-inherit-target-feature.rs
+++ b/tests/assembly/closure-inherit-target-feature.rs
@@ -15,9 +15,8 @@ pub unsafe fn sse41_blend_nofeature(x: __m128, y: __m128) -> __m128 {
         // check that _mm_blend_ps is not being inlined into the closure
         // CHECK-LABEL: {{sse41_blend_nofeature.*closure.*:}}
         // CHECK-NOT: blendps
-        // CHECK: {{call .*_mm_blend_ps.*}}
+        // CHECK: {{jmp .*_mm_blend_ps.*}}
         // CHECK-NOT: blendps
-        // CHECK: ret
         #[inline(never)]
         |x, y| _mm_blend_ps(x, y, 0b0101)
     };
diff --git a/tests/assembly/x86-return-float.rs b/tests/assembly/x86-return-float.rs
index acd1af8d38af1..0802116bf61d1 100644
--- a/tests/assembly/x86-return-float.rs
+++ b/tests/assembly/x86-return-float.rs
@@ -1,19 +1,31 @@
 //@ assembly-output: emit-asm
-//@ only-x86
-// FIXME(#114479): LLVM miscompiles loading and storing `f32` and `f64` when SSE is disabled.
-// There's no compiletest directive to ignore a test on i586 only, so just always explicitly enable
-// SSE2.
-// Use the same target CPU as `i686` so that LLVM orders the instructions in the same order.
-//@ compile-flags: -Ctarget-feature=+sse2 -Ctarget-cpu=pentium4
+//@ revisions: sse nosse
+//@[sse] compile-flags: --target i686-unknown-linux-gnu
+//@[sse] needs-llvm-components: x86
+// We make SSE available but don't use it for the ABI.
+//@[nosse] compile-flags: --target i586-unknown-linux-gnu -Ctarget-feature=+sse2 -Ctarget-cpu=pentium4
+//@[nosse] needs-llvm-components: x86
+
 // Force frame pointers to make ASM more consistent between targets
 //@ compile-flags: -O -C force-frame-pointers
 //@ filecheck-flags: --implicit-check-not fld --implicit-check-not fst
-//@ revisions: normal win
-//@[normal] ignore-windows
-//@[win] only-windows
 
-#![crate_type = "lib"]
 #![feature(f16, f128)]
+#![feature(no_core, lang_items, rustc_attrs, repr_simd)]
+#![no_core]
+#![crate_type = "lib"]
+
+#[lang = "sized"]
+trait Sized {}
+
+#[lang = "copy"]
+trait Copy {}
+
+impl Copy for f16 {}
+impl Copy for f32 {}
+impl Copy for f64 {}
+impl Copy for f128 {}
+impl Copy for usize {}
 
 // Tests that returning `f32` and `f64` with the "Rust" ABI on 32-bit x86 doesn't use the x87
 // floating point stack, as loading and storing `f32`s and `f64`s to and from the x87 stack quietens
@@ -24,7 +36,8 @@
 // CHECK-LABEL: return_f32:
 #[no_mangle]
 pub fn return_f32(x: f32) -> f32 {
-    // CHECK: movl {{.*}}(%ebp), %eax
+    // sse: movss {{.*}}(%ebp), %xmm0
+    // nosse: movl {{.*}}(%ebp), %eax
     // CHECK-NOT: ax
     // CHECK: retl
     x
@@ -33,9 +46,11 @@ pub fn return_f32(x: f32) -> f32 {
 // CHECK-LABEL: return_f64:
 #[no_mangle]
 pub fn return_f64(x: f64) -> f64 {
-    // CHECK: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
-    // CHECK-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL:.*]]
-    // CHECK-NEXT: movsd %[[VAL]], (%[[PTR]])
+    // nosse: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
+    // nosse-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL:.*]]
+    // nosse-NEXT: movsd %[[VAL]], (%[[PTR]])
+    // sse: movsd {{.*}}(%ebp), %xmm0
+    // sse-NOT: ax
     // CHECK: retl
     x
 }
@@ -148,7 +163,8 @@ pub unsafe fn call_f32(x: &mut f32) {
     }
     // CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
     // CHECK: calll {{()|_}}get_f32
-    // CHECK-NEXT: movl %eax, (%[[PTR]])
+    // sse-NEXT: movss %xmm0, (%[[PTR]])
+    // nosse-NEXT: movl %eax, (%[[PTR]])
     *x = get_f32();
 }
 
@@ -160,8 +176,9 @@ pub unsafe fn call_f64(x: &mut f64) {
     }
     // CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
     // CHECK: calll {{()|_}}get_f64
-    // CHECK: movsd {{.*}}(%{{ebp|esp}}), %[[VAL:.*]]
-    // CHECK-NEXT: movsd %[[VAL:.*]], (%[[PTR]])
+    // sse: movlps %xmm0, (%[[PTR]])
+    // nosse: movsd {{.*}}(%{{ebp|esp}}), %[[VAL:.*]]
+    // nosse-NEXT: movsd %[[VAL:.*]], (%[[PTR]])
     *x = get_f64();
 }
 
@@ -190,10 +207,8 @@ pub unsafe fn call_f64_f64(x: &mut (f64, f64)) {
     }
     // CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
     // CHECK: calll {{()|_}}get_f64_f64
-    // normal: movsd [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
-    // normal-NEXT: movsd [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
-    // win: movsd (%esp), %[[VAL1:.*]]
-    // win-NEXT: movsd 8(%esp), %[[VAL2:.*]]
+    // CHECK: movsd [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
+    // CHECK-NEXT: movsd [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
     // CHECK-NEXT: movsd %[[VAL1]], (%[[PTR]])
     // CHECK-NEXT: movsd %[[VAL2]], 8(%[[PTR]])
     *x = get_f64_f64();
@@ -207,13 +222,10 @@ pub unsafe fn call_f32_f64(x: &mut (f32, f64)) {
     }
     // CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
     // CHECK: calll {{()|_}}get_f32_f64
-    // normal: movss [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
-    // normal-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL2:.*]]
-    // win: movss (%esp), %[[VAL1:.*]]
-    // win-NEXT: movsd 8(%esp), %[[VAL2:.*]]
+    // CHECK: movss [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
+    // CHECK-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL2:.*]]
     // CHECK-NEXT: movss %[[VAL1]], (%[[PTR]])
-    // normal-NEXT: movsd %[[VAL2]], 4(%[[PTR]])
-    // win-NEXT: movsd %[[VAL2]], 8(%[[PTR]])
+    // CHECK-NEXT: movsd %[[VAL2]], 4(%[[PTR]])
     *x = get_f32_f64();
 }
 
@@ -225,10 +237,8 @@ pub unsafe fn call_f64_f32(x: &mut (f64, f32)) {
     }
     // CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
     // CHECK: calll {{()|_}}get_f64_f32
-    // normal: movsd [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
-    // normal-NEXT: movss [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
-    // win: movsd (%esp), %[[VAL1:.*]]
-    // win-NEXT: movss 8(%esp), %[[VAL2:.*]]
+    // CHECK: movsd [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
+    // CHECK-NEXT: movss [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
     // CHECK-NEXT: movsd %[[VAL1]], (%[[PTR]])
     // CHECK-NEXT: movss %[[VAL2]], 8(%[[PTR]])
     *x = get_f64_f32();
@@ -257,10 +267,8 @@ pub unsafe fn call_f64_other(x: &mut (f64, usize)) {
     }
     // CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
     // CHECK: calll {{()|_}}get_f64_other
-    // normal: movsd [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
-    // normal-NEXT: movl [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
-    // win: movsd (%esp), %[[VAL1:.*]]
-    // win-NEXT: movl 8(%esp), %[[VAL2:.*]]
+    // CHECK: movsd [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
+    // CHECK-NEXT: movl [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
     // CHECK-NEXT: movsd %[[VAL1]], (%[[PTR]])
     // CHECK-NEXT: movl %[[VAL2]], 8(%[[PTR]])
     *x = get_f64_other();
@@ -289,13 +297,10 @@ pub unsafe fn call_other_f64(x: &mut (usize, f64)) {
     }
     // CHECK: movl {{.*}}(%ebp), %[[PTR:.*]]
     // CHECK: calll {{()|_}}get_other_f64
-    // normal: movl [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
-    // normal-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL2:.*]]
-    // win: movl (%esp), %[[VAL1:.*]]
-    // win-NEXT: movsd 8(%esp), %[[VAL2:.*]]
+    // CHECK: movl [[#%d,OFFSET:]](%ebp), %[[VAL1:.*]]
+    // CHECK-NEXT: movsd [[#%d,OFFSET+4]](%ebp), %[[VAL2:.*]]
     // CHECK-NEXT: movl %[[VAL1]], (%[[PTR]])
-    // normal-NEXT: movsd %[[VAL2]], 4(%[[PTR]])
-    // win-NEXT: movsd %[[VAL2]], 8(%[[PTR]])
+    // CHECK-NEXT: movsd %[[VAL2]], 4(%[[PTR]])
     *x = get_other_f64();
 }
 
@@ -307,7 +312,8 @@ pub unsafe fn call_other_f64(x: &mut (usize, f64)) {
 pub fn return_f16(x: f16) -> f16 {
     // CHECK: pushl %ebp
     // CHECK: movl %esp, %ebp
-    // CHECK: movzwl 8(%ebp), %eax
+    // nosse: movzwl 8(%ebp), %eax
+    // sse: pinsrw $0, 8(%ebp), %xmm0
     // CHECK: popl %ebp
     // CHECK: retl
     x
@@ -316,15 +322,18 @@ pub fn return_f16(x: f16) -> f16 {
 // CHECK-LABEL: return_f128:
 #[no_mangle]
 pub fn return_f128(x: f128) -> f128 {
-    // CHECK: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
-    // CHECK-NEXT: movl [[#%d,OFFSET+4]](%ebp), %[[VAL1:.*]]
-    // CHECK-NEXT: movl [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
-    // CHECK-NEXT: movl [[#%d,OFFSET+12]](%ebp), %[[VAL3:.*]]
-    // CHECK-NEXT: movl [[#%d,OFFSET+16]](%ebp), %[[VAL4:.*]]
-    // CHECK-NEXT: movl %[[VAL4:.*]] 12(%[[PTR]])
-    // CHECK-NEXT: movl %[[VAL3:.*]] 8(%[[PTR]])
-    // CHECK-NEXT: movl %[[VAL2:.*]] 4(%[[PTR]])
-    // CHECK-NEXT: movl %[[VAL1:.*]] (%[[PTR]])
+    // CHECK: pushl %ebp
+    // sse: movaps [[#%d,OFFSET:]](%ebp), %xmm0
+    // nosse: movl [[#%d,OFFSET:]](%ebp), %[[PTR:.*]]
+    // nosse-NEXT: movl [[#%d,OFFSET+4]](%ebp), %[[VAL1:.*]]
+    // nosse-NEXT: movl [[#%d,OFFSET+8]](%ebp), %[[VAL2:.*]]
+    // nosse-NEXT: movl [[#%d,OFFSET+12]](%ebp), %[[VAL3:.*]]
+    // nosse-NEXT: movl [[#%d,OFFSET+16]](%ebp), %[[VAL4:.*]]
+    // nosse-NEXT: movl %[[VAL4:.*]] 12(%[[PTR]])
+    // nosse-NEXT: movl %[[VAL3:.*]] 8(%[[PTR]])
+    // nosse-NEXT: movl %[[VAL2:.*]] 4(%[[PTR]])
+    // nosse-NEXT: movl %[[VAL1:.*]] (%[[PTR]])
+    // CHECK: popl %ebp
     // CHECK: retl
     x
 }
diff --git a/tests/codegen/abi-x86-sse.rs b/tests/codegen/abi-x86-sse.rs
new file mode 100644
index 0000000000000..6a0f3dd18d472
--- /dev/null
+++ b/tests/codegen/abi-x86-sse.rs
@@ -0,0 +1,36 @@
+//@ compile-flags: -Z merge-functions=disabled
+
+//@ revisions: x86-64
+//@[x86-64] compile-flags: --target x86_64-unknown-linux-gnu
+//@[x86-64] needs-llvm-components: x86
+
+//@ revisions: x86-32
+//@[x86-32] compile-flags: --target i686-unknown-linux-gnu
+//@[x86-32] needs-llvm-components: x86
+
+//@ revisions: x86-32-nosse
+//@[x86-32-nosse] compile-flags: --target i586-unknown-linux-gnu
+//@[x86-32-nosse] needs-llvm-components: x86
+
+#![feature(no_core, lang_items, rustc_attrs, repr_simd)]
+#![no_core]
+#![crate_type = "lib"]
+
+#[lang = "sized"]
+trait Sized {}
+
+#[lang = "copy"]
+trait Copy {}
+
+// Ensure this type is passed without ptr indirection on targets that
+// require SSE2.
+#[repr(simd)]
+pub struct Sse([f32; 4]);
+
+// x86-64: <4 x float> @sse_id(<4 x float> {{[^,]*}})
+// x86-32: <4 x float> @sse_id(<4 x float> {{[^,]*}})
+// x86-32-nosse: void @sse_id(ptr {{[^,]*}} sret{{[^,]*}}, ptr {{[^,]*}})
+#[no_mangle]
+pub fn sse_id(x: Sse) -> Sse {
+    x
+}
diff --git a/tests/codegen/intrinsics/transmute-x64.rs b/tests/codegen/intrinsics/transmute-x64.rs
index ea1c6b0e7e801..5744c42999975 100644
--- a/tests/codegen/intrinsics/transmute-x64.rs
+++ b/tests/codegen/intrinsics/transmute-x64.rs
@@ -9,9 +9,10 @@ use std::mem::transmute;
 // CHECK-LABEL: @check_sse_float_to_int(
 #[no_mangle]
 pub unsafe fn check_sse_float_to_int(x: __m128) -> __m128i {
-    // CHECK-NOT: alloca
-    // CHECK: %0 = load <4 x float>, ptr %x, align 16
-    // CHECK: store <4 x float> %0, ptr %_0, align 16
+    // FIXME: the MIR opt still works, but the ABI logic now introduces
+    // an alloca here.
+    // CHECK: alloca
+    // CHECK: store <4 x float> %x, ptr %_0, align 16
     transmute(x)
 }
 
diff --git a/tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs b/tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs
index 75f989d6e12c4..bf779e7b1c915 100644
--- a/tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs
+++ b/tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs
@@ -38,7 +38,7 @@ pub fn build_array_s(x: [f32; 4]) -> S<4> {
 #[no_mangle]
 pub fn build_array_transmute_s(x: [f32; 4]) -> S<4> {
     // CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
-    // CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
+    // CHECK: ret <4 x float> %[[VAL:.+]]
     unsafe { std::mem::transmute(x) }
 }
 
@@ -53,6 +53,6 @@ pub fn build_array_t(x: [f32; 4]) -> T {
 #[no_mangle]
 pub fn build_array_transmute_t(x: [f32; 4]) -> T {
     // CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
-    // CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
+    // CHECK: ret <4 x float> %[[VAL:.+]]
     unsafe { std::mem::transmute(x) }
 }
diff --git a/tests/codegen/simd/packed-simd.rs b/tests/codegen/simd/packed-simd.rs
index 1df09c96e6cc0..a27d5e3af452a 100644
--- a/tests/codegen/simd/packed-simd.rs
+++ b/tests/codegen/simd/packed-simd.rs
@@ -1,4 +1,5 @@
 //@ revisions:opt3 noopt
+//@ only-x86_64
 //@[opt3] compile-flags: -Copt-level=3
 //@[noopt] compile-flags: -Cno-prepopulate-passes
 
@@ -14,14 +15,14 @@ use core::{mem, ptr};
 
 #[repr(simd, packed)]
 #[derive(Copy, Clone)]
-pub struct Simd<T, const N: usize>([T; N]);
+pub struct PackedSimd<T, const N: usize>([T; N]);
 
 #[repr(simd)]
 #[derive(Copy, Clone)]
 pub struct FullSimd<T, const N: usize>([T; N]);
 
 // non-powers-of-two have padding and need to be expanded to full vectors
-fn load<T, const N: usize>(v: Simd<T, N>) -> FullSimd<T, N> {
+fn load<T, const N: usize>(v: PackedSimd<T, N>) -> FullSimd<T, N> {
     unsafe {
         let mut tmp = mem::MaybeUninit::<FullSimd<T, N>>::uninit();
         ptr::copy_nonoverlapping(&v as *const _, tmp.as_mut_ptr().cast(), 1);
@@ -29,18 +30,16 @@ fn load<T, const N: usize>(v: Simd<T, N>) -> FullSimd<T, N> {
     }
 }
 
-// CHECK-LABEL: square_packed_full
-// CHECK-SAME: ptr{{[a-z_ ]*}} sret([[RET_TYPE:[^)]+]]) [[RET_ALIGN:align (8|16)]]{{[^%]*}} [[RET_VREG:%[_0-9]*]]
-// CHECK-SAME: ptr{{[a-z_ ]*}} align 4
+// CHECK-LABEL: define <3 x float> @square_packed_full(ptr{{[a-z_ ]*}} align 4 {{[^,]*}})
 #[no_mangle]
-pub fn square_packed_full(x: Simd<f32, 3>) -> FullSimd<f32, 3> {
-    // CHECK-NEXT: start
-    // noopt: alloca [[RET_TYPE]], [[RET_ALIGN]]
-    // CHECK: load <3 x float>
+pub fn square_packed_full(x: PackedSimd<f32, 3>) -> FullSimd<f32, 3> {
+    // The unoptimized version of this is not very interesting to check
+    // since `load` does not get inlined.
+    // opt3-NEXT: start:
+    // opt3-NEXT: load <3 x float>
     let x = load(x);
-    // CHECK: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
-    // CHECK-NEXT: store <3 x float> [[VREG]], ptr [[RET_VREG]], [[RET_ALIGN]]
-    // CHECK-NEXT: ret void
+    // opt3-NEXT: [[VREG:%[a-z0-9_]+]] = fmul <3 x float>
+    // opt3-NEXT: ret <3 x float> [[VREG:%[a-z0-9_]+]]
     unsafe { intrinsics::simd_mul(x, x) }
 }
 
@@ -48,7 +47,7 @@ pub fn square_packed_full(x: Simd<f32, 3>) -> FullSimd<f32, 3> {
 // CHECK-SAME: ptr{{[a-z_ ]*}} sret([[RET_TYPE:[^)]+]]) [[RET_ALIGN:align 4]]{{[^%]*}} [[RET_VREG:%[_0-9]*]]
 // CHECK-SAME: ptr{{[a-z_ ]*}} align 4
 #[no_mangle]
-pub fn square_packed(x: Simd<f32, 3>) -> Simd<f32, 3> {
+pub fn square_packed(x: PackedSimd<f32, 3>) -> PackedSimd<f32, 3> {
     // CHECK-NEXT: start
     // CHECK-NEXT: load <3 x float>
     // noopt-NEXT: load <3 x float>
diff --git a/tests/ui/sse-abi-checks.rs b/tests/ui/sse-simd-abi-checks.rs
similarity index 82%
rename from tests/ui/sse-abi-checks.rs
rename to tests/ui/sse-simd-abi-checks.rs
index d400e6eb698e9..6b4e4afee348a 100644
--- a/tests/ui/sse-abi-checks.rs
+++ b/tests/ui/sse-simd-abi-checks.rs
@@ -1,6 +1,7 @@
 //! Ensure we trigger abi_unsupported_vector_types for target features that are usually enabled
-//! on a target, but disabled in this file via a `-C` flag.
-//@ compile-flags: --crate-type=rlib --target=i586-unknown-linux-gnu -C target-feature=-sse,-sse2
+//! on a target via the base CPU, but disabled in this file via a `-C` flag.
+//@ compile-flags: --crate-type=rlib --target=i586-unknown-linux-gnu
+//@ compile-flags: -Ctarget-cpu=pentium4 -C target-feature=-sse,-sse2
 //@ build-pass
 //@ ignore-pass (test emits codegen-time warnings)
 //@ needs-llvm-components: x86
diff --git a/tests/ui/sse-abi-checks.stderr b/tests/ui/sse-simd-abi-checks.stderr
similarity index 94%
rename from tests/ui/sse-abi-checks.stderr
rename to tests/ui/sse-simd-abi-checks.stderr
index e08b2d4e19179..f777d341b5343 100644
--- a/tests/ui/sse-abi-checks.stderr
+++ b/tests/ui/sse-simd-abi-checks.stderr
@@ -1,5 +1,5 @@
 warning: this function definition uses a SIMD vector type that (with the chosen ABI) requires the `sse` target feature, which is not enabled
-  --> $DIR/sse-abi-checks.rs:21:1
+  --> $DIR/sse-simd-abi-checks.rs:22:1
    |
 LL | pub unsafe extern "C" fn f(_: SseVector) {
    | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ function defined here
@@ -13,7 +13,7 @@ warning: 1 warning emitted
 
 Future incompatibility report: Future breakage diagnostic:
 warning: this function definition uses a SIMD vector type that (with the chosen ABI) requires the `sse` target feature, which is not enabled
-  --> $DIR/sse-abi-checks.rs:21:1
+  --> $DIR/sse-simd-abi-checks.rs:22:1
    |
 LL | pub unsafe extern "C" fn f(_: SseVector) {
    | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ function defined here