From 1dd0d90b69ddde74196fa639dfb2e44897a62f82 Mon Sep 17 00:00:00 2001
From: Chris Fallin <chris@cfallin.org>
Date: Mon, 9 Nov 2020 19:05:40 -0800
Subject: [PATCH] x64 backend: merge loads into ALU ops when appropriate.

This PR makes use of the support in #2366 for sinking effectful
instructions and merging them with consumers. In particular, on x86, we
want to make use of the ability of many instructions to load one operand
directly from memory. That is, instead of this:

```
movq 0(%rdi), %rax
addq %rax, %rbx
```

we want to generate this:

```
addq 0(%rdi), %rbx
```

As described in more detail in #2366, sinking and merging the load is
only possible under certain conditions. In particular, we need to ensure
that the use is the *only* use (otherwise the load happens more than
once), and we need to ensure that it does not move across other effectful
ops (see #2366 for how we ensure this).

This change is actually fairly simple, given that all the framework is in
place: we simply pattern-match a load on one operand of an ALU instruction
that takes an RMI (reg, mem, or immediate) operand, and generate the mem
form when we match.

Also makes a drive-by improvement in the x64 backend to use
statically-monomorphized `LowerCtx` types rather than a `&mut dyn
LowerCtx`.

On `bz2.wasm`, this results in ~1% instruction-count reduction. More is
likely possible by following up with other instructions that can merge
memory loads as well.
---
 cranelift/codegen/src/isa/x64/lower.rs       | 154 +++++++++++++-----
 .../filetests/filetests/isa/x64/load-op.clif |  43 +++++
 2 files changed, 156 insertions(+), 41 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/load-op.clif

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 5261396d20f7..812075612bcd 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -22,9 +22,6 @@ use smallvec::SmallVec;
 use std::convert::TryFrom;
 use target_lexicon::Triple;

-/// Context passed to all lowering functions.
-type Ctx<'a> = &'a mut dyn LowerCtx<I = Inst>;
-
 //=============================================================================
 // Helpers for instruction lowering.

@@ -89,32 +86,98 @@ fn matches_input_any<C: LowerCtx<I = Inst>>(
     })
 }

+fn generate_constant<C: LowerCtx<I = Inst>>(ctx: &mut C, ty: Type, c: u64) -> Reg {
+    let from_bits = ty_bits(ty);
+    let masked = if from_bits < 64 {
+        c & ((1u64 << from_bits) - 1)
+    } else {
+        c
+    };
+
+    let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
+    for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
+        ctx.alloc_tmp(reg_class, ty)
+    })
+    .into_iter()
+    {
+        ctx.emit(inst);
+    }
+    cst_copy.to_reg()
+}
+
 /// Put the given input into a register, and mark it as used (side-effect).
-fn put_input_in_reg(ctx: Ctx, spec: InsnInput) -> Reg {
+fn put_input_in_reg<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Reg {
+    let ty = ctx.input_ty(spec.insn, spec.input);
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     if let Some(c) = input.constant {
+        // Generate constants fresh at each use to minimize long-range register pressure.
+        generate_constant(ctx, ty, c)
+    } else {
+        ctx.put_input_in_reg(spec.insn, spec.input)
+    }
+}
+
+fn is_mergeable_load<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    src_insn: IRInst,
+) -> Option<(InsnInput, i32)> {
+    let insn_data = ctx.data(src_insn);
+    let inputs = ctx.num_inputs(src_insn);
+    if inputs != 1 {
+        return None;
+    }
+
+    let load_ty = ctx.output_ty(src_insn, 0);
+    if ty_bits(load_ty) < 32 {
+        // Narrower values are handled by ALU insts that are at least 32 bits
+        // wide, which is normally OK as we ignore upper bits; but, if we
+        // generate, e.g., a direct-from-memory 32-bit add for a byte value and
+        // the byte is the last byte in a page, the extra data that we load is
+        // incorrectly accessed. So we only allow loads to merge for
+        // 32-bit-and-above widths.
+        return None;
+    }
+
+    // Just testing the opcode is enough, because the width will always match if
+    // the type does (and the type should match if the CLIF is properly
+    // constructed).
+    if insn_data.opcode() == Opcode::Load {
+        let offset = insn_data
+            .load_store_offset()
+            .expect("load should have offset");
+        Some((
+            InsnInput {
+                insn: src_insn,
+                input: 0,
+            },
+            offset,
+        ))
+    } else {
+        None
+    }
+}
+
+/// Put the given input into a register or a memory operand.
+/// Effectful: may mark the given input as used, when returning the register form.
+fn input_to_reg_mem<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMem {
+    let inputs = ctx.get_input_as_source_or_const(spec.insn, spec.input);
+
+    if let Some(c) = inputs.constant {
         // Generate constants fresh at each use to minimize long-range register pressure.
         let ty = ctx.input_ty(spec.insn, spec.input);
-        let from_bits = ty_bits(ty);
-        let masked = if from_bits < 64 {
-            c & ((1u64 << from_bits) - 1)
-        } else {
-            c
-        };
+        return RegMem::reg(generate_constant(ctx, ty, c));
+    }

-        let cst_copy = ctx.alloc_tmp(Inst::rc_for_type(ty).unwrap(), ty);
-        for inst in Inst::gen_constant(cst_copy, masked, ty, |reg_class, ty| {
-            ctx.alloc_tmp(reg_class, ty)
-        })
-        .into_iter()
-        {
-            ctx.emit(inst);
+    if let Some((src_insn, 0)) = inputs.inst {
+        if let Some((addr_input, offset)) = is_mergeable_load(ctx, src_insn) {
+            ctx.sink_inst(src_insn);
+            let amode = lower_to_amode(ctx, addr_input, offset);
+            return RegMem::mem(amode);
         }
-        cst_copy.to_reg()
-    } else {
-        ctx.put_input_in_reg(spec.insn, spec.input)
     }
+
+    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
 }

 /// An extension specification for `extend_input_to_reg`.
@@ -128,7 +191,11 @@ enum ExtSpec {

 /// Put the given input into a register, marking it as used, and do a zero- or signed- extension if
 /// required. (This obviously causes side-effects.)
-fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
+fn extend_input_to_reg<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    spec: InsnInput,
+    ext_spec: ExtSpec,
+) -> Reg {
     let requested_size = match ext_spec {
         ExtSpec::ZeroExtendTo32 | ExtSpec::SignExtendTo32 => 32,
         ExtSpec::ZeroExtendTo64 | ExtSpec::SignExtendTo64 => 64,
@@ -160,13 +227,6 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
     dst.to_reg()
 }

-/// Put the given input into a register or a memory operand.
-/// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
-    // TODO handle memory; merge a load directly, if possible.
-    RegMem::reg(ctx.put_input_in_reg(spec.insn, spec.input))
-}
-
 /// Returns whether the given input is an immediate that can be properly sign-extended, without any
 /// possible side-effect.
 fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u64> {
@@ -182,20 +242,20 @@ fn non_reg_input_to_sext_imm(input: NonRegInput, input_ty: Type) -> Option<u64>
     })
 }

-fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
+fn input_to_sext_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     let input_ty = ctx.input_ty(spec.insn, spec.input);
     non_reg_input_to_sext_imm(input, input_ty)
 }

-fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
+fn input_to_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> Option<u64> {
     ctx.get_input_as_source_or_const(spec.insn, spec.input)
         .constant
 }

 /// Put the given input into an immediate, a register or a memory operand.
 /// Effectful: may mark the given input as used, when returning the register form.
-fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
+fn input_to_reg_mem_imm<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput) -> RegMemImm {
     let input = ctx.get_input_as_source_or_const(spec.insn, spec.input);
     let input_ty = ctx.input_ty(spec.insn, spec.input);
     match non_reg_input_to_sext_imm(input, input_ty) {
@@ -305,7 +365,7 @@ fn emit_extract_lane<C: LowerCtx<I = Inst>>(
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
 /// function and the use of the flags!
-fn emit_cmp(ctx: Ctx, insn: IRInst) {
+fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst) {
     let ty = ctx.input_ty(insn, 0);

     let inputs = [InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
@@ -355,7 +415,12 @@ enum FcmpCondResult {
 ///
 /// Note: make sure that there are no instructions modifying the flags between a call to this
 /// function and the use of the flags!
-fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
+fn emit_fcmp<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    mut cond_code: FloatCC,
+    spec: FcmpSpec,
+) -> FcmpCondResult {
     let (flip_operands, inverted_equal) = match cond_code {
         FloatCC::LessThan
         | FloatCC::LessThanOrEqual
@@ -407,7 +472,12 @@ fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) ->
     cond_result
 }

-fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
+fn make_libcall_sig<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    insn: IRInst,
+    call_conv: CallConv,
+    ptr_ty: Type,
+) -> Signature {
     let mut sig = Signature::new(call_conv);
     for i in 0..ctx.num_inputs(insn) {
         sig.params.push(AbiParam::new(ctx.input_ty(insn, i)));
@@ -827,14 +897,16 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                | Opcode::Bor
                | Opcode::Bxor => {
                    // For commutative operations, try to commute operands if one is an
-                    // immediate.
-                    if let Some(imm) = input_to_sext_imm(ctx, inputs[0]) {
-                        (put_input_in_reg(ctx, inputs[1]), RegMemImm::imm(imm))
+                    // immediate or direct memory reference. Do so by converting LHS to RMI; if
+                    // reg, then always convert RHS to RMI; else, use LHS as RMI and convert
+                    // RHS to reg.
+                    let lhs = input_to_reg_mem_imm(ctx, inputs[0]);
+                    if let RegMemImm::Reg { reg: lhs_reg } = lhs {
+                        let rhs = input_to_reg_mem_imm(ctx, inputs[1]);
+                        (lhs_reg, rhs)
                     } else {
-                        (
-                            put_input_in_reg(ctx, inputs[0]),
-                            input_to_reg_mem_imm(ctx, inputs[1]),
-                        )
+                        let rhs_reg = put_input_in_reg(ctx, inputs[1]);
+                        (rhs_reg, lhs)
                     }
                 }
                 Opcode::Isub => (
diff --git a/cranelift/filetests/filetests/isa/x64/load-op.clif b/cranelift/filetests/filetests/isa/x64/load-op.clif
new file mode 100644
index 000000000000..002c4b757118
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/load-op.clif
@@ -0,0 +1,43 @@
+test compile
+target x86_64
+feature "experimental_x64"
+
+function %add_from_mem_u32_1(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = load.i32 v0
+    v3 = iadd.i32 v2, v1
+    ; check: addl 0(%rdi),
+    return v3
+}
+
+function %add_from_mem_u32_2(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = load.i32 v0
+    v3 = iadd.i32 v1, v2
+    ; check: addl 0(%rdi),
+    return v3
+}
+
+function %add_from_mem_u64_1(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+    v2 = load.i64 v0
+    v3 = iadd.i64 v2, v1
+    ; check: addq 0(%rdi),
+    return v3
+}
+
+function %add_from_mem_u64_2(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+    v2 = load.i64 v0
+    v3 = iadd.i64 v1, v2
+    ; check: addq 0(%rdi),
+    return v3
+}
+
+function %add_from_mem_not_narrow(i64, i8) -> i8 {
+block0(v0: i64, v1: i8):
+    v2 = load.i8 v0
+    v3 = iadd.i8 v2, v1
+    ; check: movzbq 0(%rdi),
+    return v3
+}
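A quick illustration of the only-use restriction described in the commit message. The sketch below is not part of the patch: the function name is hypothetical and no `check:` lines are given, since the exact expected disassembly has not been verified. When the loaded value has more than one use, the load must stay a separate instruction; merging it into each consumer would execute the load twice.

```
function %add_from_mem_two_uses(i64, i32) -> i32 {
block0(v0: i64, v1: i32):
    v2 = load.i32 v0
    ; v2 feeds both iadds below, so the only-use condition for sinking the
    ; load is not met and v2 must be materialized in a register once.
    v3 = iadd.i32 v2, v1
    v4 = iadd.i32 v3, v2
    return v4
}
```

In other words, the merge in `input_to_reg_mem` only fires when `is_mergeable_load` succeeds and the load feeds exactly one consumer.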