From 1dd0d90b69ddde74196fa639dfb2e44897a62f82 Mon Sep 17 00:00:00 2001
From: Chris Fallin <chris@cfallin.org>
Date: Mon, 9 Nov 2020 19:05:40 -0800
Subject: [PATCH] x64 backend: merge loads into ALU ops when appropriate.

This PR makes use of the support in #2366 for sinking effectful
instructions and merging them with consumers. In particular, on x86, we
want to make use of the ability of many instructions to load one operand
directly from memory. That is, instead of this:

```
movq 0(%rdi), %rax
addq %rax, %rbx
```

we want to generate this:

```
addq 0(%rdi), %rbx
```

As described in more detail in #2366, sinking and merging the load is
only possible under certain conditions. In particular, we need to ensure
that the use is the *only* use (otherwise the load happens more than
once), and we need to ensure that it does not move across other effectful
ops (see #2366 for how we ensure this).

This change is actually fairly simple, given that all the framework is in
place: we simply pattern-match a load on one operand of an ALU instruction
that takes an RMI (reg, mem, or immediate) operand, and generate the mem
form when we match.

Also makes a drive-by improvement in the x64 backend to use
statically-monomorphized `LowerCtx` types rather than a `&mut dyn
LowerCtx`.

On `bz2.wasm`, this results in ~1% instruction-count reduction. More is
likely possible by following up with other instructions that can merge
memory loads as well.
---
 cranelift/codegen/src/isa/x64/lower.rs       | 154 +++++++++++++-----
 .../filetests/filetests/isa/x64/load-op.clif |  43 +++++
 2 files changed, 156 insertions(+), 41 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/load-op.clif

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 5261396d20f7..812075612bcd 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -22,9 +22,6 @@ use smallvec::SmallVec;
 use std::convert::TryFrom;
 use target_lexicon::Triple;

-/// Context passed to all lowering functions.
-type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
-
 //=============================================================================
 // Helpers for instruction lowering.

@@ -89,32 +86,98 @@ fn matches_input_any<C: LowerCtx<I = Inst>>(
     })
 }

+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> Reg {
+    let from_bits = ty_bits(ty);
+    let masked = if from_bits < 64 {
+        c & ((1u64 << from_bits) - 1)
+    } else {
+        c
+    };
+
+    let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
+    for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
+        ctx.alloc_tmp(reg_class, ty)
+    })
+    .into_iter()
+    {
+        ctx.emit(inst);
+    }
+    cst_copy.to_reg()
+}
+
 /// Put the given input into a register, and mark it as used (side-effect).
-fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg {
+fn put_input_in_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Reg {
+    let ty = ctx.input_ty(spec.insn, spec.input);
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     if let Some(c) = input.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        generate_constant(ctx, ty, c)
+    } else {
+        ctx.put_input_in_reg(spec.insn, spec.input)
+    }
+}
+
+fn is_mergeable_load<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    src_insn: IRInst,
+) -> Option<(InsnInput, i32)> {
+    let insn_data = ctx.data(src_insn);
+    let inputs = ctx.num_inputs(src_insn);
+    if inputs != 1 {
+        return None;
+    }
+
+    let load_ty = ctx.output_ty(src_insn, 0);
+    if ty_bits(load_ty) < 32 {
+        // Narrower values are handled by ALU insts that are at least 32 bits
+        // wide, which is normally OK as we ignore upper bits; but, if we
+        // generate, e.g., a direct-from-memory 32-bit add for a byte value and
+        // the byte is the last byte in a page, the extra data that we load is
+        // incorrectly accessed. So we only allow loads to merge for
+        // 32-bit-and-above widths.
+        return None;
+    }
+
+    // Just testing the opcode is enough, because the width will always match if
+    // the type does (and the type should match if the CLIF is properly
+    // constructed).
+    if insn_data.opcode() == Opcode::Load {
+        let offset = insn_data
+            .load_store_offset()
+            .expect("load should have offset");
+        Some((
+            InsnInput {
+                insn: src_insn,
+                input: 0,
+            },
+            offset,
+        ))
+    } else {
+        None
+    }
+}
+
+/// Put the given input into a register or a memory operand.
+/// Effectful: may mark the given input as used, when returning the register form.
+fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMem {
+    let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
+
+    if let Some(c) = inputs.constant {
         // Generate constants fresh at each use to minimize long-range register pressure.
         let ty = ctx.input_ty(spec.insn, spec.input);
-        let from_bits = ty_bits(ty);
-        let masked = if from_bits < 64 {
-            c & ((1u64 << from_bits) - 1)
-        } else {
-            c
-        };
+        return RegMem::reg(generate_constant(ctx, ty, c));
+    }

-        let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
-        for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
-            ctx.alloc_tmp(reg_class, ty)
-        })
-        .into_iter()
-        {
-            ctx.emit(inst);
+    if let Some((src_insn, 0)) = inputs.inst {
+        if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
+            ctx.sink_inst(src_insn);
+            let amode = lower_to_amode(ctx, addr_input, offset);
+            return RegMem::mem(amode);
         }
-        cst_copy.to_reg()
-    } else {
-        ctx.put_input_in_reg(spec.insn, spec.input)
     }
+
+    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
 }

 /// An extension specification for `extend_input_to_reg`.
@@ -128,7 +191,11 @@ enum ExtSpec {

 /// Put the given input into a register, marking it as used, and do a zero- or signed- extension if
 /// required. (This obviously causes side-effects.)
-fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
+fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    spec: InsnInput,
+    ext_spec: ExtSpec,
+) -> Reg {
     let requested_size = match ext_spec {
         ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
         ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
@@ -160,13 +227,6 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
     dst.to_reg()
 }

-/// Put the given input into a register or a memory operand.
-/// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
-    // TODO handle memory; merge a load directly, if possible.
-    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
-}
-
 /// Returns whether the given input is an immediate that can be properly sign-extended, without any
 /// possible side-effect.
 fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u64> {
@@ -182,20 +242,20 @@ fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u64>
     })
 }

-fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
+fn input_to_sext_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     let input_ty = ctx.input_ty(spec.insn, spec.input);
     non_reg_input_to_sext_imm(input, input_ty)
 }

-fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
+fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
     ctx.get_input_as_source_or_const(spec.insn, spec.input)
         .constant
 }

 /// Put the given input into an immediate, a register or a memory operand.
 /// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
+fn input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm {
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     let input_ty = ctx.input_ty(spec.insn, spec.input);
     match non_reg_input_to_sext_imm(input, input_ty) {
@@ -305,7 +365,7 @@ fn emit_extract_lane<C: LowerCtx<I = Inst>>(
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
 /// function and the use of the flags!
-fn emit_cmp(ctx: Ctx, insn: IRInst) {
+fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
     let ty = ctx.input_ty(insn, 0);

     let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
@@ -355,7 +415,12 @@ enum FcmpCondResult {
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
 /// function and the use of the flags!
-fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
+fn emit_fcmp<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    mut cond_code: FloatCC,
+    spec: FcmpSpec,
+) -> FcmpCondResult {
     let (flip_operands, inverted_equal) = match cond_code {
         FloatCC::LessThan
         | FloatCC::LessThanOrEqual
@@ -407,7 +472,12 @@ fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) ->
     cond_result
 }

-fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
+fn make_libcall_sig<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    call_conv: CallConv,
+    ptr_ty: Type,
+) -> Signature {
     let mut sig = Signature::new(call_conv);
     for i in 0..ctx.num_inputs(insn) {
         sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
@@ -827,14 +897,16 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                | Opcode::Bor
                | Opcode::Bxor => {
                    // For commutative operations, try to commute operands if one is an
-                    // immediate.
-                    if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) {
-                        (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm))
+                    // immediate or direct memory reference. Do so by converting LHS to RMI; if
+                    // reg, then always convert RHS to RMI; else, use LHS as RMI and convert
+                    // RHS to reg.
+                    let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
+                    if let RegMemImm::Reg { reg: lhs_reg } = lhs {
+                        let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
+                        (lhs_reg, rhs)
                     } else {
-                        (
-                            put_input_in_reg(ctx, inputs[0]),
-                            input_to_reg_mem_imm(ctx, inputs[1]),
-                        )
+                        let rhs_reg = put_input_in_reg(ctx, inputs[1]);
+                        (rhs_reg, lhs)
                     }
                 }
                 Opcode::Isub => (
diff --git a/cranelift/filetests/filetests/isa/x64/load-op.clif b/cranelift/filetests/filetests/isa/x64/load-op.clif
new file mode 100644
index 000000000000..002c4b757118
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/load-op.clif
@@ -0,0 +1,43 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+function %add_from_mem_u32_1(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = load.i32 v0
+    v3 = iadd.i32 v2, v1
+    ; check: addl 0(%rdi),
+    return v3
+}
+
+function %add_from_mem_u32_2(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = load.i32 v0
+    v3 = iadd.i32 v1, v2
+    ; check: addl 0(%rdi),
+    return v3
+}
+
+function %add_from_mem_u64_1(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+    v2 = load.i64 v0
+    v3 = iadd.i64 v2, v1
+    ; check: addq 0(%rdi),
+    return v3
+}
+
+function %add_from_mem_u64_2(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+    v2 = load.i64 v0
+    v3 = iadd.i64 v1, v2
+    ; check: addq 0(%rdi),
+    return v3
+}
+
+function %add_from_mem_not_narrow(i64, i8) -> i8 {
+block0(v0: i64, v1: i8):
+    v2 = load.i8 v0
+    v3 = iadd.i8 v2, v1
+    ; check: movzbq 0(%rdi),
+    return v3
+}
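A quick illustration of the only-use restriction described in the commit message. The sketch below is not part of the patch: the function name is hypothetical and no `check:` lines are given, since the exact expected disassembly has not been verified. When the loaded value has more than one use, the load must stay a separate instruction; merging it into each consumer would execute the load twice.

```
function %add_from_mem_two_uses(i64, i32) -> i32 {
block0(v0: i64, v1: i32):
    v2 = load.i32 v0
    ; v2 feeds both iadds below, so the only-use condition for sinking the
    ; load is not met and v2 must be materialized in a register once.
    v3 = iadd.i32 v2, v1
    v4 = iadd.i32 v3, v2
    return v4
}
```

In other words, the merge in `input_to_reg_mem` only fires when `is_mergeable_load` succeeds and the load feeds exactly one consumer.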