Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove my scalar_copy_backend_type optimization attempt #123185

Merged
merged 3 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions compiler/rustc_codegen_llvm/src/type_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,6 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> {
fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
ty.llvm_type(self)
}
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
layout.scalar_copy_llvm_type(self)
}
}

impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {
Expand Down
42 changes: 0 additions & 42 deletions compiler/rustc_codegen_llvm/src/type_of.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use rustc_middle::bug;
use rustc_middle::ty::layout::{LayoutOf, TyAndLayout};
use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
use rustc_middle::ty::{self, Ty, TypeVisitableExt};
use rustc_target::abi::HasDataLayout;
use rustc_target::abi::{Abi, Align, FieldsShape};
use rustc_target::abi::{Int, Pointer, F128, F16, F32, F64};
use rustc_target::abi::{Scalar, Size, Variants};
Expand Down Expand Up @@ -166,7 +165,6 @@ pub trait LayoutLlvmExt<'tcx> {
index: usize,
immediate: bool,
) -> &'a Type;
fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
}

impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
Expand Down Expand Up @@ -308,44 +306,4 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {

self.scalar_llvm_type_at(cx, scalar)
}

fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
debug_assert!(self.is_sized());

// FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
// (matching the 128-bit SIMD types proposal) and 256 bits on x64
// (like AVX2 registers) seems at least like a tolerable starting point.
let threshold = cx.data_layout().pointer_size * 4;
if self.layout.size() > threshold {
return None;
}

// Vectors, even for non-power-of-two sizes, have the same layout as
// arrays but don't count as aggregate types
// While LLVM theoretically supports non-power-of-two sizes, and they
// often work fine, sometimes x86-isel deals with them horribly
// (see #115212) so for now only use power-of-two ones.
if let FieldsShape::Array { count, .. } = self.layout.fields()
&& count.is_power_of_two()
&& let element = self.field(cx, 0)
&& element.ty.is_integral()
{
// `cx.type_ix(bits)` is tempting here, but while that works great
// for things that *stay* as memory-to-memory copies, it also ends
// up suppressing vectorization as it introduces shifts when it
// extracts all the individual values.

let ety = element.llvm_type(cx);
if *count == 1 {
// Emitting `<1 x T>` would be silly; just use the scalar.
return Some(ety);
} else {
return Some(cx.type_vector(ety, *count));
}
}

// FIXME: The above only handled integer arrays; surely more things
// would also be possible. Be careful about provenance, though!
None
}
}
38 changes: 3 additions & 35 deletions compiler/rustc_codegen_ssa/src/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::mir;
use crate::mir::operand::OperandValue;
use crate::mir::place::PlaceRef;
use crate::traits::*;
use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, MemFlags, ModuleCodegen, ModuleKind};
use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, ModuleCodegen, ModuleKind};

use rustc_ast::expand::allocator::{global_fn_name, AllocatorKind, ALLOCATOR_METHODS};
use rustc_attr as attr;
Expand All @@ -37,7 +37,7 @@ use rustc_session::config::{self, CrateType, EntryFnType, OptLevel, OutputType};
use rustc_session::Session;
use rustc_span::symbol::sym;
use rustc_span::Symbol;
use rustc_target::abi::{Align, FIRST_VARIANT};
use rustc_target::abi::FIRST_VARIANT;

use std::cmp;
use std::collections::BTreeSet;
Expand Down Expand Up @@ -282,15 +282,7 @@ pub fn coerce_unsized_into<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
}

if src_f.layout.ty == dst_f.layout.ty {
memcpy_ty(
bx,
dst_f.llval,
dst_f.align,
src_f.llval,
src_f.align,
src_f.layout,
MemFlags::empty(),
);
bx.typed_place_copy(dst_f, src_f);
} else {
coerce_unsized_into(bx, src_f, dst_f);
}
Expand Down Expand Up @@ -382,30 +374,6 @@ pub fn wants_new_eh_instructions(sess: &Session) -> bool {
wants_wasm_eh(sess) || wants_msvc_seh(sess)
}

pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
bx: &mut Bx,
dst: Bx::Value,
dst_align: Align,
src: Bx::Value,
src_align: Align,
layout: TyAndLayout<'tcx>,
flags: MemFlags,
) {
let size = layout.size.bytes();
if size == 0 {
return;
}

if flags == MemFlags::empty()
&& let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
{
let temp = bx.load(bty, src, src_align);
bx.store(temp, dst, dst_align);
} else {
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
}
}

pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(
cx: &'a Bx::CodegenCx,
instance: Instance<'tcx>,
Expand Down
13 changes: 3 additions & 10 deletions compiler/rustc_codegen_ssa/src/mir/block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1459,7 +1459,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
}
_ => (op.immediate_or_packed_pair(bx), arg.layout.align.abi, false),
},
Ref(llval, _, align) => match arg.mode {
Ref(llval, llextra, align) => match arg.mode {
PassMode::Indirect { attrs, .. } => {
let required_align = match attrs.pointee_align {
Some(pointee_align) => cmp::max(pointee_align, arg.layout.align.abi),
Expand All @@ -1470,15 +1470,8 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
// alignment requirements may be higher than the type's alignment, so copy
// to a higher-aligned alloca.
let scratch = PlaceRef::alloca_aligned(bx, arg.layout, required_align);
base::memcpy_ty(
bx,
scratch.llval,
scratch.align,
llval,
align,
op.layout,
MemFlags::empty(),
);
let op_place = PlaceRef { llval, llextra, layout: op.layout, align };
bx.typed_place_copy(scratch, op_place);
(scratch.llval, scratch.align, true)
} else {
(llval, align, true)
Expand Down
16 changes: 5 additions & 11 deletions compiler/rustc_codegen_ssa/src/mir/operand.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use super::place::PlaceRef;
use super::{FunctionCx, LocalRef};

use crate::base;
use crate::size_of_val;
use crate::traits::*;
use crate::MemFlags;
Expand Down Expand Up @@ -398,7 +397,7 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
self.store_with_flags(bx, dest, MemFlags::NONTEMPORAL);
}

fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
pub(crate) fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
self,
bx: &mut Bx,
dest: PlaceRef<'tcx, V>,
Expand All @@ -410,16 +409,11 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
// Avoid generating stores of zero-sized values, because the only way to have a zero-sized
// value is through `undef`/`poison`, and the store itself is useless.
}
OperandValue::Ref(r, None, source_align) => {
OperandValue::Ref(llval, llextra @ None, source_align) => {
assert!(dest.layout.is_sized(), "cannot directly store unsized values");
if flags.contains(MemFlags::NONTEMPORAL) {
// HACK(nox): This is inefficient but there is no nontemporal memcpy.
let ty = bx.backend_type(dest.layout);
let val = bx.load(ty, r, source_align);
bx.store_with_flags(val, dest.llval, dest.align, flags);
return;
}
base::memcpy_ty(bx, dest.llval, dest.align, r, source_align, dest.layout, flags)
let source_place =
PlaceRef { llval, llextra, align: source_align, layout: dest.layout };
bx.typed_place_copy_with_flags(dest, source_place, flags);
}
OperandValue::Ref(_, Some(_), _) => {
bug!("cannot directly store unsized values");
Expand Down
25 changes: 20 additions & 5 deletions compiler/rustc_codegen_ssa/src/traits/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,17 +281,32 @@ pub trait BuilderMethods<'a, 'tcx>:
dst: PlaceRef<'tcx, Self::Value>,
src: PlaceRef<'tcx, Self::Value>,
) {
debug_assert!(src.llextra.is_none());
debug_assert!(dst.llextra.is_none());
self.typed_place_copy_with_flags(dst, src, MemFlags::empty());
}

fn typed_place_copy_with_flags(
&mut self,
dst: PlaceRef<'tcx, Self::Value>,
src: PlaceRef<'tcx, Self::Value>,
flags: MemFlags,
) {
debug_assert!(src.llextra.is_none(), "cannot directly copy from unsized values");
debug_assert!(dst.llextra.is_none(), "cannot directly copy into unsized values");
debug_assert_eq!(dst.layout.size, src.layout.size);
if self.sess().opts.optimize == OptLevel::No && self.is_backend_immediate(dst.layout) {
if flags.contains(MemFlags::NONTEMPORAL) {
// HACK(nox): This is inefficient but there is no nontemporal memcpy.
let ty = self.backend_type(dst.layout);
let val = self.load(ty, src.llval, src.align);
self.store_with_flags(val, dst.llval, dst.align, flags);
} else if self.sess().opts.optimize == OptLevel::No && self.is_backend_immediate(dst.layout)
{
// If we're not optimizing, the aliasing information from `memcpy`
// isn't useful, so just load-store the value for smaller code.
let temp = self.load_operand(src);
temp.val.store(self, dst);
temp.val.store_with_flags(self, dst, flags);
} else if !dst.layout.is_zst() {
let bytes = self.const_usize(dst.layout.size.bytes());
self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, MemFlags::empty());
self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, flags);
}
}

Expand Down
22 changes: 0 additions & 22 deletions compiler/rustc_codegen_ssa/src/traits/type_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,28 +133,6 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
|| self.is_backend_immediate(layout)
|| self.is_backend_scalar_pair(layout))
}

/// A type that can be used in a [`super::BuilderMethods::load`] +
/// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
/// such as a MIR `*_0 = *_1`.
///
/// It's always legal to return `None` here, as the provided impl does,
/// in which case callers should use [`super::BuilderMethods::memcpy`]
/// instead of the `load`+`store` pair.
///
/// This can be helpful for things like arrays, where the LLVM backend type
/// `[3 x i16]` optimizes to three separate loads and stores, but it can
/// instead be copied via an `i48` that stays as the single `load`+`store`.
/// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
/// cases, due to `poison` handling, but in codegen we have more information
/// about the type invariants, so can emit something better instead.)
///
/// This *should* return `None` for particularly-large types, where leaving
/// the `memcpy` may well be important to avoid code size explosion.
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
let _ = layout;
None
}
}

// For backends that support CFI using type membership (i.e., testing whether a given pointer is
Expand Down
44 changes: 25 additions & 19 deletions tests/codegen/array-codegen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,58 @@
// CHECK-LABEL: @array_load
#[no_mangle]
pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
// CHECK: %_0 = alloca [4 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP1]], ptr %_0, align 1
// CHECK: %[[TEMP2:.+]] = load i32, ptr %_0, align 1
// CHECK: ret i32 %[[TEMP2]]
// CHECK-NOT: alloca
// CHECK: %[[ALLOCA:.+]] = alloca [4 x i8], align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[ALLOCA]], ptr align 1 %a, {{.+}} 4, i1 false)
// CHECK: %[[TEMP:.+]] = load i32, ptr %[[ALLOCA]], align 1
// CHECK: ret i32 %[[TEMP]]
*a
}

// CHECK-LABEL: @array_store
#[no_mangle]
pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = alloca i32, [[TEMPALIGN:align [0-9]+]]
// CHECK-NOT: alloca
// CHECK: %a = alloca [4 x i8]
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
// CHECK-NOT: alloca
// store i32 %0, ptr %[[TEMP]]
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %a, ptr [[TEMPALIGN]] %[[TEMP]], {{.+}} 4, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %a, {{.+}} 4, i1 false)
*p = a;
}

// CHECK-LABEL: @array_copy
#[no_mangle]
pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
// CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 4, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 4, i1 false)
*p = *a;
}

// CHECK-LABEL: @array_copy_1_element
#[no_mangle]
pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [1 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load i8, ptr %a, align 1
// CHECK: store i8 %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load i8, ptr %[[LOCAL]], align 1
// CHECK: store i8 %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 1, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 1, i1 false)
*p = *a;
}

// CHECK-LABEL: @array_copy_2_elements
#[no_mangle]
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [2 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <2 x i8>, ptr %a, align 1
// CHECK: store <2 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load <2 x i8>, ptr %[[LOCAL]], align 1
// CHECK: store <2 x i8> %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 2, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 2, i1 false)
*p = *a;
}
8 changes: 4 additions & 4 deletions tests/codegen/array-optimized.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
#[no_mangle]
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = load <2 x i8>, ptr %a, align 1
// CHECK: store <2 x i8> %[[TEMP]], ptr %p, align 1
// CHECK: %[[TEMP:.+]] = load i16, ptr %a, align 1
// CHECK: store i16 %[[TEMP]], ptr %p, align 1
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that while we now generate an alloca and memcpys for this (as seen in the array-codegen file), LLVM is able to remove them.

If LLVM picks i16 for this (and not <2 x i8>), then great, let's do that.

// CHECK: ret
*p = *a;
}
Expand All @@ -26,8 +26,8 @@ pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
#[no_mangle]
pub fn array_copy_4_elements(a: &[u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP]], ptr %p, align 1
// CHECK: %[[TEMP:.+]] = load i32, ptr %a, align 1
// CHECK: store i32 %[[TEMP]], ptr %p, align 1
// CHECK: ret
*p = *a;
}
Loading
Loading