Skip to content

Commit

Permalink
Remove my scalar_copy_backend_type optimization attempt
Browse files Browse the repository at this point in the history
I added this back in 111999, but I no longer think it's a good idea
- It had to get scaled back to only power-of-two things to not break a bunch of targets
- LLVM seems to be getting better at memcpy removal anyway
- Introducing vector instructions has seemed to sometimes (115515) make autovectorization worse

So this removes it from the codegen crates entirely, and instead just tries to use <https://doc.rust-lang.org/nightly/nightly-rustc/rustc_codegen_ssa/traits/builder/trait.BuilderMethods.html#method.typed_place_copy> instead of direct `memcpy` so things will still use load/store for immediates.
  • Loading branch information
scottmcm committed Mar 29, 2024
1 parent 548e14b commit 120cb65
Show file tree
Hide file tree
Showing 9 changed files with 40 additions and 142 deletions.
3 changes: 0 additions & 3 deletions compiler/rustc_codegen_llvm/src/type_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,6 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> {
fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
ty.llvm_type(self)
}
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
layout.scalar_copy_llvm_type(self)
}
}

impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {
Expand Down
42 changes: 0 additions & 42 deletions compiler/rustc_codegen_llvm/src/type_of.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use rustc_middle::bug;
use rustc_middle::ty::layout::{LayoutOf, TyAndLayout};
use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
use rustc_middle::ty::{self, Ty, TypeVisitableExt};
use rustc_target::abi::HasDataLayout;
use rustc_target::abi::{Abi, Align, FieldsShape};
use rustc_target::abi::{Int, Pointer, F128, F16, F32, F64};
use rustc_target::abi::{Scalar, Size, Variants};
Expand Down Expand Up @@ -166,7 +165,6 @@ pub trait LayoutLlvmExt<'tcx> {
index: usize,
immediate: bool,
) -> &'a Type;
fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
}

impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
Expand Down Expand Up @@ -308,44 +306,4 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {

self.scalar_llvm_type_at(cx, scalar)
}

fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
debug_assert!(self.is_sized());

// FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
// (matching the 128-bit SIMD types proposal) and 256 bits on x64
// (like AVX2 registers) seems at least like a tolerable starting point.
let threshold = cx.data_layout().pointer_size * 4;
if self.layout.size() > threshold {
return None;
}

// Vectors, even for non-power-of-two sizes, have the same layout as
// arrays but don't count as aggregate types
// While LLVM theoretically supports non-power-of-two sizes, and they
// often work fine, sometimes x86-isel deals with them horribly
// (see #115212) so for now only use power-of-two ones.
if let FieldsShape::Array { count, .. } = self.layout.fields()
&& count.is_power_of_two()
&& let element = self.field(cx, 0)
&& element.ty.is_integral()
{
// `cx.type_ix(bits)` is tempting here, but while that works great
// for things that *stay* as memory-to-memory copies, it also ends
// up suppressing vectorization as it introduces shifts when it
// extracts all the individual values.

let ety = element.llvm_type(cx);
if *count == 1 {
// Emitting `<1 x T>` would be silly; just use the scalar.
return Some(ety);
} else {
return Some(cx.type_vector(ety, *count));
}
}

// FIXME: The above only handled integer arrays; surely more things
// would also be possible. Be careful about provenance, though!
None
}
}
38 changes: 3 additions & 35 deletions compiler/rustc_codegen_ssa/src/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::mir;
use crate::mir::operand::OperandValue;
use crate::mir::place::PlaceRef;
use crate::traits::*;
use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, MemFlags, ModuleCodegen, ModuleKind};
use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, ModuleCodegen, ModuleKind};

use rustc_ast::expand::allocator::{global_fn_name, AllocatorKind, ALLOCATOR_METHODS};
use rustc_attr as attr;
Expand All @@ -37,7 +37,7 @@ use rustc_session::config::{self, CrateType, EntryFnType, OutputType};
use rustc_session::Session;
use rustc_span::symbol::sym;
use rustc_span::Symbol;
use rustc_target::abi::{Align, FIRST_VARIANT};
use rustc_target::abi::FIRST_VARIANT;

use std::cmp;
use std::collections::BTreeSet;
Expand Down Expand Up @@ -282,15 +282,7 @@ pub fn coerce_unsized_into<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
}

if src_f.layout.ty == dst_f.layout.ty {
memcpy_ty(
bx,
dst_f.llval,
dst_f.align,
src_f.llval,
src_f.align,
src_f.layout,
MemFlags::empty(),
);
bx.typed_place_copy(dst_f, src_f);
} else {
coerce_unsized_into(bx, src_f, dst_f);
}
Expand Down Expand Up @@ -355,30 +347,6 @@ pub fn wants_new_eh_instructions(sess: &Session) -> bool {
wants_wasm_eh(sess) || wants_msvc_seh(sess)
}

pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
bx: &mut Bx,
dst: Bx::Value,
dst_align: Align,
src: Bx::Value,
src_align: Align,
layout: TyAndLayout<'tcx>,
flags: MemFlags,
) {
let size = layout.size.bytes();
if size == 0 {
return;
}

if flags == MemFlags::empty()
&& let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
{
let temp = bx.load(bty, src, src_align);
bx.store(temp, dst, dst_align);
} else {
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
}
}

pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(
cx: &'a Bx::CodegenCx,
instance: Instance<'tcx>,
Expand Down
15 changes: 4 additions & 11 deletions compiler/rustc_codegen_ssa/src/mir/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ use crate::common::{self, IntPredicate};
use crate::errors::CompilerBuiltinsCannotCall;
use crate::meth;
use crate::traits::*;
use crate::MemFlags;

use rustc_ast as ast;
use rustc_ast::{InlineAsmOptions, InlineAsmTemplatePiece};
Expand Down Expand Up @@ -1454,7 +1453,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
}
_ => (op.immediate_or_packed_pair(bx), arg.layout.align.abi, false),
},
Ref(llval, _, align) => match arg.mode {
Ref(llval, llextra, align) => match arg.mode {
PassMode::Indirect { attrs, .. } => {
let required_align = match attrs.pointee_align {
Some(pointee_align) => cmp::max(pointee_align, arg.layout.align.abi),
Expand All @@ -1465,22 +1464,16 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
// alignment requirements may be higher than the type's alignment, so copy
// to a higher-aligned alloca.
let scratch = PlaceRef::alloca_aligned(bx, arg.layout, required_align);
base::memcpy_ty(
bx,
scratch.llval,
scratch.align,
llval,
align,
op.layout,
MemFlags::empty(),
);
let op_place = PlaceRef { llval, llextra, layout: op.layout, align };
bx.typed_place_copy(scratch, op_place);
(scratch.llval, scratch.align, true)
} else {
(llval, align, true)
}
}
_ => (llval, align, true),
},

ZeroSized => match arg.mode {
PassMode::Indirect { on_stack, .. } => {
if on_stack {
Expand Down
4 changes: 2 additions & 2 deletions compiler/rustc_codegen_ssa/src/mir/operand.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use super::place::PlaceRef;
use super::{FunctionCx, LocalRef};

use crate::base;
use crate::size_of_val;
use crate::traits::*;
use crate::MemFlags;
Expand Down Expand Up @@ -419,7 +418,8 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
bx.store_with_flags(val, dest.llval, dest.align, flags);
return;
}
base::memcpy_ty(bx, dest.llval, dest.align, r, source_align, dest.layout, flags)
let size = bx.cx().const_usize(dest.layout.size.bytes());
bx.memcpy(dest.llval, dest.align, r, source_align, size, flags);
}
OperandValue::Ref(_, Some(_), _) => {
bug!("cannot directly store unsized values");
Expand Down
22 changes: 0 additions & 22 deletions compiler/rustc_codegen_ssa/src/traits/type_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,28 +133,6 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
|| self.is_backend_immediate(layout)
|| self.is_backend_scalar_pair(layout))
}

/// A type that can be used in a [`super::BuilderMethods::load`] +
/// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
/// such as a MIR `*_0 = *_1`.
///
/// It's always legal to return `None` here, as the provided impl does,
/// in which case callers should use [`super::BuilderMethods::memcpy`]
/// instead of the `load`+`store` pair.
///
/// This can be helpful for things like arrays, where the LLVM backend type
/// `[3 x i16]` optimizes to three separate loads and stores, but it can
/// instead be copied via an `i48` that stays as the single `load`+`store`.
/// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
/// cases, due to `poison` handling, but in codegen we have more information
/// about the type invariants, so can emit something better instead.)
///
/// This *should* return `None` for particularly-large types, where leaving
/// the `memcpy` may well be important to avoid code size explosion.
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
let _ = layout;
None
}
}

// For backends that support CFI using type membership (i.e., testing whether a given pointer is
Expand Down
44 changes: 25 additions & 19 deletions tests/codegen/array-codegen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,58 @@
// CHECK-LABEL: @array_load
#[no_mangle]
pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
// CHECK: %_0 = alloca [4 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP1]], ptr %_0, align 1
// CHECK: %[[TEMP2:.+]] = load i32, ptr %_0, align 1
// CHECK: ret i32 %[[TEMP2]]
// CHECK-NOT: alloca
// CHECK: %[[ALLOCA:.+]] = alloca [4 x i8], align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[ALLOCA]], ptr align 1 %a, {{.+}} 4, i1 false)
// CHECK: %[[TEMP:.+]] = load i32, ptr %[[ALLOCA]], align 1
// CHECK: ret i32 %[[TEMP]]
*a
}

// CHECK-LABEL: @array_store
#[no_mangle]
pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = alloca i32, [[TEMPALIGN:align [0-9]+]]
// CHECK-NOT: alloca
// CHECK: %a = alloca [4 x i8]
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
// CHECK-NOT: alloca
// store i32 %0, ptr %[[TEMP]]
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %a, ptr [[TEMPALIGN]] %[[TEMP]], {{.+}} 4, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %a, {{.+}} 4, i1 false)
*p = a;
}

// CHECK-LABEL: @array_copy
#[no_mangle]
pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
// CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 4, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 4, i1 false)
*p = *a;
}

// CHECK-LABEL: @array_copy_1_element
#[no_mangle]
pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [1 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load i8, ptr %a, align 1
// CHECK: store i8 %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load i8, ptr %[[LOCAL]], align 1
// CHECK: store i8 %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 1, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 1, i1 false)
*p = *a;
}

// CHECK-LABEL: @array_copy_2_elements
#[no_mangle]
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [2 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <2 x i8>, ptr %a, align 1
// CHECK: store <2 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load <2 x i8>, ptr %[[LOCAL]], align 1
// CHECK: store <2 x i8> %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 2, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 2, i1 false)
*p = *a;
}
8 changes: 4 additions & 4 deletions tests/codegen/array-optimized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
#[no_mangle]
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = load <2 x i8>, ptr %a, align 1
// CHECK: store <2 x i8> %[[TEMP]], ptr %p, align 1
// CHECK: %[[TEMP:.+]] = load i16, ptr %a, align 1
// CHECK: store i16 %[[TEMP]], ptr %p, align 1
// CHECK: ret
*p = *a;
}
Expand All @@ -26,8 +26,8 @@ pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
#[no_mangle]
pub fn array_copy_4_elements(a: &[u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP]], ptr %p, align 1
// CHECK: %[[TEMP:.+]] = load i32, ptr %a, align 1
// CHECK: store i32 %[[TEMP]], ptr %p, align 1
// CHECK: ret
*p = *a;
}
6 changes: 2 additions & 4 deletions tests/codegen/mem-replace-simple-type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ pub fn replace_short_array_3(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] {
// CHECK-LABEL: @replace_short_array_4(
pub fn replace_short_array_4(r: &mut [u32; 4], v: [u32; 4]) -> [u32; 4] {
// CHECK-NOT: alloca
// CHECK: %[[R:.+]] = load <4 x i32>, ptr %r, align 4
// CHECK: store <4 x i32> %[[R]], ptr %result
// CHECK: %[[V:.+]] = load <4 x i32>, ptr %v, align 4
// CHECK: store <4 x i32> %[[V]], ptr %r
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %result, ptr align 4 %r, i64 16, i1 false)
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %r, ptr align 4 %v, i64 16, i1 false)
std::mem::replace(r, v)
}

0 comments on commit 120cb65

Please sign in to comment.