egraphs: don't let rematerialization override LICM. (#7306)
This reworks the way that remat and LICM interact during aegraph
elaboration. In principle, both happen during the same single-pass "code
placement" algorithm: we decide where to place pure instructions (those
that are eligible for movement), and remat pushes them one way while
LICM pushes them the other.

The interaction is a little more subtle than simple heuristic priority,
though -- it's really a decision ordering issue. A remat'd value wants to sink
as deep into the loop nest as it can (to the use's block), but we don't
know *where* the uses go until we process them (and make LICM-related
choices), and we process uses after defs during elaboration. Or more
precisely, we have some work at the use before recursively processing
the def, and some work after the recursion returns; and the LICM
decision happens after recursion returns, because LICM wants to know
where the defs are to know how high we can hoist. (The recursion is
itself unrolled into a state machine on an explicit stack, so that's a
little hard to see, but that's what is happening in principle.)
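
To make that ordering concrete, here's a minimal sketch of the unrolled
recursion (all names here -- `Frame`, `args_of`, the integer node ids --
are invented stand-ins, not the actual Cranelift types): use-side work
happens when a node's `Start` frame is popped, and the placement (LICM)
decision only when its `PendingInst` frame comes back off the stack, by
which point all defs are placed.

```rust
/// Minimal sketch of the unrolled elaboration recursion described above.
/// `Frame`, `args_of`, and the integer node ids are invented for
/// illustration; the real elaborator uses Cranelift's `ElabStackEntry`
/// variants and value/instruction entities.
enum Frame {
    /// About to visit a node: work at the use happens here, before defs.
    Start(usize),
    /// All args resolved: the placement (LICM) decision happens here,
    /// because only now do we know where the defs landed.
    PendingInst(usize),
}

fn elaborate(args_of: &[Vec<usize>], root: usize) -> Vec<usize> {
    let mut stack = vec![Frame::Start(root)];
    let mut placement_order = Vec::new();
    while let Some(frame) = stack.pop() {
        match frame {
            Frame::Start(n) => {
                // Schedule the "after recursion returns" work, then the args.
                stack.push(Frame::PendingInst(n));
                for &a in &args_of[n] {
                    stack.push(Frame::Start(a));
                }
            }
            // Defs are always placed before the use that demanded them.
            Frame::PendingInst(n) => placement_order.push(n),
        }
    }
    placement_order
}

fn main() {
    // Node 0 uses nodes 1 and 2; node 1 also uses node 2. (No memoization
    // here, so node 2 is visited once per use.)
    let order = elaborate(&[vec![1, 2], vec![2], vec![]], 0);
    println!("{order:?}"); // [2, 2, 1, 0]: defs first, use last
}
```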

The solution here is to make remat a separate just-in-time thing, once
we have arg values. Just before we plug the final arg values into the
elaborated instruction, we ask: is this a remat'd value, and if so, do
we have a copy of the computation in this block yet? If not, we make
one. This has to happen in two places (the main elab loop and the
toplevel driver from the skeleton).
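
In sketch form, the per-block copy cache behaves like the stand-alone
model below (`Block` and `Value` as plain integers and `materialize`
standing in for cloning the defining instruction are assumptions for
illustration; the real logic is `maybe_remat_arg` in the diff further
down):

```rust
use std::collections::HashMap;

// Stand-ins for Cranelift's entity types, for illustration only.
type Block = u32;
type Value = u32;

/// At most one rematerialized copy of a value per block, no matter how
/// many uses in that block ask for it. `materialize` stands in for
/// cloning the defining instruction just before the insertion point.
fn remat_in_block(
    copies: &mut HashMap<(Block, Value), Value>,
    block: Block,
    value: Value,
    materialize: &mut impl FnMut() -> Value,
) -> Value {
    *copies.entry((block, value)).or_insert_with(materialize)
}

fn main() {
    let mut copies = HashMap::new();
    let mut next = 100;
    let mut fresh = || {
        next += 1;
        next // a new Value naming the cloned instruction's result
    };
    let a = remat_in_block(&mut copies, 7, 3, &mut fresh);
    let b = remat_in_block(&mut copies, 7, 3, &mut fresh);
    assert_eq!(a, b); // the second use in block 7 reuses the first copy
}
```

Keying the cache on the (block, value) pair is what keeps repeated uses
within one block from each cloning the defining instruction again.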

The one downside of this solution is that it doesn't handle *recursive*
rematerialization by default. This means that if we, for example, decide
to remat single-constant-arg adds (as we actually do in our current
rules), we won't then also recursively remat the constant arg to those
adds. This can be seen in the `licm.clif` test case. This doesn't seem
to be a dealbreaker to me because most such cases will be able to fold
the constants anyway. (They happen mostly because of pointer
pre-computations: a loop over structs in Wasm computes `heap_base + p +
offset`, and naive LICM pulls a `heap_base + offset` out of the loop for
every struct field accessed in the loop, resulting in horrible register
pressure; that's why we have that remat rule. Most such offsets are
pretty small.)

Fixes #7283.
cfallin authored Oct 20, 2023
1 parent 3a5e112 commit 77d030c
Showing 6 changed files with 161 additions and 90 deletions.
2 changes: 1 addition & 1 deletion cranelift/codegen/src/egraph.rs
@@ -670,7 +670,7 @@ pub(crate) struct Stats {
     pub(crate) elaborate_visit_node: u64,
     pub(crate) elaborate_memoize_hit: u64,
     pub(crate) elaborate_memoize_miss: u64,
-    pub(crate) elaborate_memoize_miss_remat: u64,
+    pub(crate) elaborate_remat: u64,
     pub(crate) elaborate_licm_hoist: u64,
     pub(crate) elaborate_func: u64,
     pub(crate) elaborate_func_pre_insts: u64,
152 changes: 100 additions & 52 deletions cranelift/codegen/src/egraph/elaborate.rs
@@ -5,7 +5,8 @@ use super::cost::{pure_op_cost, Cost};
 use super::domtree::DomTreeWithChildren;
 use super::Stats;
 use crate::dominator_tree::DominatorTree;
-use crate::fx::FxHashSet;
+use crate::fx::{FxHashMap, FxHashSet};
+use crate::hash_map::Entry as HashEntry;
 use crate::ir::{Block, Function, Inst, Value, ValueDef};
 use crate::loop_analysis::{Loop, LoopAnalysis};
 use crate::scoped_hash_map::ScopedHashMap;
@@ -56,6 +57,8 @@ pub(crate) struct Elaborator<'a> {
     elab_result_stack: Vec<ElaboratedValue>,
     /// Explicitly-unrolled block elaboration stack.
     block_stack: Vec<BlockStackEntry>,
+    /// Copies of values that have been rematerialized.
+    remat_copies: FxHashMap<(Block, Value), Value>,
     /// Stats for various events during egraph processing, to help
     /// with optimization of this infrastructure.
     stats: &'a mut Stats,
@@ -95,7 +98,6 @@ enum ElabStackEntry {
         inst: Inst,
         result_idx: usize,
         num_args: usize,
-        remat: bool,
         before: Inst,
     },
 }
@@ -134,6 +136,7 @@ impl<'a> Elaborator<'a> {
             elab_stack: vec![],
             elab_result_stack: vec![],
             block_stack: vec![],
+            remat_copies: FxHashMap::default(),
             stats,
         }
     }
@@ -256,6 +259,45 @@ impl<'a> Elaborator<'a> {
         self.elab_result_stack.pop().unwrap()
     }
 
+    /// Possibly rematerialize the instruction producing the value in
+    /// `arg` and rewrite `arg` to refer to it, if needed. Returns
+    /// `true` if a rewrite occurred.
+    fn maybe_remat_arg(
+        remat_values: &FxHashSet<Value>,
+        func: &mut Function,
+        remat_copies: &mut FxHashMap<(Block, Value), Value>,
+        insert_block: Block,
+        before: Inst,
+        arg: &mut ElaboratedValue,
+        stats: &mut Stats,
+    ) -> bool {
+        // TODO (#7313): we may want to consider recursive
+        // rematerialization as well. We could process the arguments of
+        // the rematerialized instruction up to a certain depth. This
+        // would affect, e.g., adds-with-one-constant-arg, which are
+        // currently rematerialized. Right now we don't do this, to
+        // avoid the need for another fixpoint loop here.
+        if arg.in_block != insert_block && remat_values.contains(&arg.value) {
+            let new_value = match remat_copies.entry((insert_block, arg.value)) {
+                HashEntry::Occupied(o) => *o.get(),
+                HashEntry::Vacant(v) => {
+                    let inst = func.dfg.value_def(arg.value).inst().unwrap();
+                    debug_assert_eq!(func.dfg.inst_results(inst).len(), 1);
+                    let new_inst = func.dfg.clone_inst(inst);
+                    func.layout.insert_inst(new_inst, before);
+                    let new_result = func.dfg.inst_results(new_inst)[0];
+                    *v.insert(new_result)
+                }
+            };
+            trace!("rematerialized {} as {}", arg.value, new_value);
+            arg.value = new_value;
+            stats.elaborate_remat += 1;
+            true
+        } else {
+            false
+        }
+    }
+
     fn process_elab_stack(&mut self) {
         while let Some(entry) = self.elab_stack.pop() {
             match entry {
@@ -278,39 +320,17 @@
                     // eclass.
                     trace!("looking up best value for {}", value);
                     let (_, best_value) = self.value_to_best_value[value];
-                    debug_assert_ne!(best_value, Value::reserved_value());
                     trace!("elaborate: value {} -> best {}", value, best_value);
+                    debug_assert_ne!(best_value, Value::reserved_value());
+
+                    if let Some(elab_val) = self.value_to_elaborated_value.get(&canonical_value) {
+                        // Value is available; use it.
+                        trace!("elaborate: value {} -> {:?}", value, elab_val);
+                        self.stats.elaborate_memoize_hit += 1;
+                        self.elab_result_stack.push(*elab_val);
+                        continue;
+                    }
 
-                    let remat = if let Some(elab_val) =
-                        self.value_to_elaborated_value.get(&canonical_value)
-                    {
-                        // Value is available. Look at the defined
-                        // block, and determine whether this node kind
-                        // allows rematerialization if the value comes
-                        // from another block. If so, ignore the hit
-                        // and recompute below.
-                        let remat = elab_val.in_block != self.cur_block
-                            && self.remat_values.contains(&best_value);
-                        if !remat {
-                            trace!("elaborate: value {} -> {:?}", value, elab_val);
-                            self.stats.elaborate_memoize_hit += 1;
-                            self.elab_result_stack.push(*elab_val);
-                            continue;
-                        }
-                        trace!("elaborate: value {} -> remat", canonical_value);
-                        self.stats.elaborate_memoize_miss_remat += 1;
-                        // The op is pure at this point, so it is always valid to
-                        // remove from this map.
-                        self.value_to_elaborated_value.remove(&canonical_value);
-                        true
-                    } else {
-                        // Value not available; but still look up
-                        // whether it's been flagged for remat because
-                        // this affects placement.
-                        let remat = self.remat_values.contains(&best_value);
-                        trace!(" -> not present in map; remat = {}", remat);
-                        remat
-                    };
                     self.stats.elaborate_memoize_miss += 1;
 
                     // Now resolve the value to its definition to see
Expand Down Expand Up @@ -358,7 +378,6 @@ impl<'a> Elaborator<'a> {
inst,
result_idx,
num_args,
remat,
before,
});

@@ -375,23 +394,21 @@
                     inst,
                     result_idx,
                     num_args,
-                    remat,
                     before,
                 } => {
                     trace!(
-                        "PendingInst: {} result {} args {} remat {} before {}",
+                        "PendingInst: {} result {} args {} before {}",
                         inst,
                         result_idx,
                         num_args,
-                        remat,
                         before
                     );
 
                     // We should have all args resolved at this
                     // point. Grab them and drain them out, removing
                     // them.
                     let arg_idx = self.elab_result_stack.len() - num_args;
-                    let arg_values = &self.elab_result_stack[arg_idx..];
+                    let arg_values = &mut self.elab_result_stack[arg_idx..];
 
                     // Compute max loop depth.
                     //
@@ -437,16 +454,15 @@ impl<'a> Elaborator<'a> {
 
                     // We know that this is a pure inst, because
                     // non-pure roots have already been placed in the
-                    // value-to-elab'd-value map and are never subject
-                    // to remat, so they will not reach this stage of
-                    // processing.
+                    // value-to-elab'd-value map, so they will not
+                    // reach this stage of processing.
                     //
                     // We now must determine the location at which we
                     // place the instruction. This is the current
                     // block *unless* we hoist above a loop when all
                     // args are loop-invariant (and this op is pure).
                     let (scope_depth, before, insert_block) =
-                        if loop_hoist_level == self.loop_stack.len() || remat {
+                        if loop_hoist_level == self.loop_stack.len() {
                             // Depends on some value at the current
                             // loop depth, or remat forces it here:
                             // place it at the current location.
@@ -479,16 +495,39 @@ impl<'a> Elaborator<'a> {
                         insert_block
                     );
 
-                    // Now we need to place `inst` at the computed
-                    // location (just before `before`). Note that
-                    // `inst` may already have been placed somewhere
-                    // else, because a pure node may be elaborated at
-                    // more than one place. In this case, we need to
-                    // duplicate the instruction (and return the
-                    // `Value`s for that duplicated instance
-                    // instead).
+                    // Now that we have the location for the
+                    // instruction, check if any of its args are remat
+                    // values. If so, and if we don't have a copy of
+                    // the rematerializing instruction for this block
+                    // yet, create one.
+                    let mut remat_arg = false;
+                    for arg_value in arg_values.iter_mut() {
+                        if Self::maybe_remat_arg(
+                            &self.remat_values,
+                            &mut self.func,
+                            &mut self.remat_copies,
+                            insert_block,
+                            before,
+                            arg_value,
+                            &mut self.stats,
+                        ) {
+                            remat_arg = true;
+                        }
+                    }
+
+                    // Now we need to place `inst` at the computed
+                    // location (just before `before`). Note that
+                    // `inst` may already have been placed somewhere
+                    // else, because a pure node may be elaborated at
+                    // more than one place. In this case, we need to
+                    // duplicate the instruction (and return the
+                    // `Value`s for that duplicated instance instead).
+                    //
+                    // Also clone if we rematerialized, because we
+                    // don't want to rewrite the args in the original
+                    // copy.
                     trace!("need inst {} before {}", inst, before);
-                    let inst = if self.func.layout.inst_block(inst).is_some() {
+                    let inst = if self.func.layout.inst_block(inst).is_some() || remat_arg {
                         // Clone the inst!
                         let new_inst = self.func.dfg.clone_inst(inst);
                         trace!(
Expand Down Expand Up @@ -605,7 +644,16 @@ impl<'a> Elaborator<'a> {
// Elaborate the arg, placing any newly-inserted insts
// before `before`. Get the updated value, which may
// be different than the original.
let new_arg = self.elaborate_eclass_use(*arg, before);
let mut new_arg = self.elaborate_eclass_use(*arg, before);
Self::maybe_remat_arg(
&self.remat_values,
&mut self.func,
&mut self.remat_copies,
block,
inst,
&mut new_arg,
&mut self.stats,
);
trace!(" -> rewrote arg to {:?}", new_arg);
*arg = new_arg.value;
}
Expand Down
13 changes: 0 additions & 13 deletions cranelift/codegen/src/scoped_hash_map.rs
@@ -188,19 +188,6 @@
             .checked_sub(1)
             .expect("generation_by_depth cannot be empty")
     }
-
-    /// Remote an entry.
-    pub fn remove(&mut self, key: &K) -> Option<V> {
-        self.map.remove(key).and_then(|val| {
-            let entry_generation = val.generation;
-            let entry_depth = val.level as usize;
-            if self.generation_by_depth.get(entry_depth).cloned() == Some(entry_generation) {
-                Some(val.value)
-            } else {
-                None
-            }
-        })
-    }
 }
 
 #[cfg(test)]
77 changes: 57 additions & 20 deletions cranelift/filetests/filetests/egraph/licm.clif
@@ -20,21 +20,21 @@ block2(v9: i32):
 }
 
 ; check: block0(v0: i32, v1: i32):
-; nextln: jump block1(v0)
+; check: jump block1(v0)
 
 ; check: block1(v2: i32):
-;; constants are not lifted; they are rematerialized in each block where used
-; check: v5 = iconst.i32 40
-; check: v6 = icmp eq v2, v5
-; check: v3 = iconst.i32 1
-; check: v8 = iadd v2, v3
-; check: brif v6, block2, block1(v8)
+;; constants are rematerialized in each block where used
+; check: v10 = iconst.i32 40
+; check: v11 = icmp eq v2, v10
+; check: v12 = iconst.i32 1
+; check: v13 = iadd v2, v12
+; check: brif v11, block2, block1(v13)
 
 
 ; check: block2:
-; check: v10 = iconst.i32 1
-; check: v4 = iadd.i32 v1, v10
-; check: return v4
+; check: v14 = iconst.i32 1
+; check: v15 = iadd.i32 v1, v14
+; check: return v15
 
 function %f(i64x2, i32) -> i64x2 {
 block0(v0: i64x2, v1: i32):
@@ -52,14 +52,14 @@
 }
 
 ; check: block0(v0: i64x2, v1: i32):
-; nextln: v4 = vconst.i64x2 const0
+; check: v4 = vconst.i64x2 const0
 ; nextln: jump block1(v0, v1)
 ; check: block1(v2: i64x2, v3: i32):
-; check: v6 = iconst.i32 1
-; check: v7 = isub v3, v6
+; check: v9 = iconst.i32 1
+; check: v10 = isub v3, v9
 ; check: v5 = iadd v2, v4
 ; check: v8 -> v5
-; check: brif v7, block1(v5, v7), block2
+; check: brif v10, block1(v5, v10), block2
 ; check: block2:
 ; check: return v5
 
Expand Down Expand Up @@ -94,13 +94,50 @@ block4:
; check: v8 = vconst.i64x2 const0
; check: jump block2(v3, v1)
; check: block2(v6: i64x2, v7: i32):
; check: v10 = iconst.i32 1
; check: v11 = isub v7, v10
; check: v15 = iconst.i32 1
; check: v16 = isub v7, v15
; check: v9 = iadd v6, v8
; check: brif v11, block2(v9, v11), block3
; check: brif v16, block2(v9, v16), block3
; check: block3:
; check: v15 = iconst.i32 1
; check: v14 = isub.i32 v5, v15
; check: brif v14, block1(v9, v14), block4
; check: v17 = iconst.i32 1
; check: v18 = isub.i32 v5, v17
; check: brif v18, block1(v9, v18), block4
; check: block4:
; check: return v9

;; Don't let a rematerialized iconst inhibit (or even reverse)
;; LICM. See issue #7283.

function %f(i64, i64) {
block0(v0: i64, v1: i64):
;; Create a loop-invariant value `v10` which is some operation which
;; includes a constant somewhere.
v8 = load.f64 v0+100
v9 = f64const 0x1.0000000000000p1
v10 = fdiv v8, v9

;; jump to the loop
v3 = iconst.i64 0
jump block2(v3) ; v3 = 0

block2(v11: i64):
;; store the loop-invariant `v10` to memory "somewhere"
v15 = iadd v0, v11
store.f64 v10, v15

;; loop breakout condition
v17 = iadd_imm v11, 1
v19 = icmp_imm ne v17, 100
brif v19, block2(v17), block1

block1:
return
}

; check: load
; check: f64const
; check: fdiv
; check: block2(v11: i64)
; check: iadd
; check: store
; check: brif
2 changes: 1 addition & 1 deletion cranelift/filetests/filetests/egraph/load-hoist.clif
@@ -41,4 +41,4 @@ function %foo(i64 vmctx, i64, i32, i32) -> i32 fast {
 ; check: block2(v6: i32, v7: i32, v15: i32):
 ; check: v10 = iadd.i64 v9, v8
 ; check: v11 = load.i32 little heap v10
-; check: brif v17, block2(v12, v14, v17), block4
+; check: brif v19, block2(v12, v21, v19), block4