From ad9f4254dc64895e2fd3cf9b0f51cedaa6df1c76 Mon Sep 17 00:00:00 2001
From: Laurence Tratt
Date: Thu, 21 Nov 2024 10:08:23 +0000
Subject: [PATCH 1/2] Deal with input register aliasing.

Currently our register allocator assumes that SSA variables can't alias.
This is true for the parts under our control -- but isn't true of the
inputs to a trace! For example both `%0` and `%1` might come from `RAX`.
This caused a panic in the register allocator.

I'd noted the problem in https://github.com/ykjit/yk/issues/1449 but
thought it happened rarely. Edd noticed that it happens in other, more
common situations (including as a result of common subexpression
elimination), including in a yklua benchmark, and created a test for it.

This commit is a "quick fix" (it's not pretty, or efficient): when we
spot aliasing, we spill aliased variables onto the stack, implicitly
dealiasing them. In other words, we avoid having to think about aliasing
in the main bulk of the register allocator at the expense of having to
spill (and potentially unspill) values unnecessarily.

Co-authored-by: Edd Barrett
---
 .../compile/jitc_yk/codegen/x64/lsregalloc.rs | 30 ++++++++++++++----
 ykrt/src/compile/jitc_yk/codegen/x64/mod.rs   | 31 ++++++++++++++++++-
 2 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs b/ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs
index 529d5c6fd..bf9d8aa58 100644
--- a/ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs
+++ b/ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs
@@ -252,12 +252,30 @@ impl<'a> LSRegAlloc<'a> {
 
 /// The parts of the register allocator needed for general purpose registers.
 impl LSRegAlloc<'_> {
-    /// Forcibly assign the machine register `reg`, which must be in the [RegState::Empty] state,
-    /// to the value produced by instruction `iidx`.
-    pub(crate) fn force_assign_inst_gp_reg(&mut self, iidx: InstIdx, reg: Rq) {
-        debug_assert!(!self.gp_regset.is_set(reg));
-        self.gp_regset.set(reg);
-        self.gp_reg_states[usize::from(reg.code())] = RegState::FromInst(iidx);
+    /// Forcibly assign the machine register `reg` to the value produced by instruction `iidx`.
+    /// Note that if this register is already used, a spill will be generated instead.
+    pub(crate) fn force_assign_inst_gp_reg(&mut self, asm: &mut Assembler, iidx: InstIdx, reg: Rq) {
+        if self.gp_regset.is_set(reg) {
+            debug_assert_eq!(self.spills[usize::from(iidx)], SpillState::Empty);
+            // Input values alias to a single register. To avoid the rest of the register allocator
+            // having to think about this, we "dealias" the values by spilling.
+            let inst = self.m.inst_no_copies(iidx);
+            let size = inst.def_byte_size(self.m);
+            self.stack.align(size); // FIXME
+            let frame_off = self.stack.grow(size);
+            let off = i32::try_from(frame_off).unwrap();
+            match size {
+                1 => dynasm!(asm; mov BYTE [rbp - off], Rb(reg.code())),
+                2 => dynasm!(asm; mov WORD [rbp - off], Rw(reg.code())),
+                4 => dynasm!(asm; mov DWORD [rbp - off], Rd(reg.code())),
+                8 => dynasm!(asm; mov QWORD [rbp - off], Rq(reg.code())),
+                _ => unreachable!(),
+            }
+            self.spills[usize::from(iidx)] = SpillState::Stack(off);
+        } else {
+            self.gp_regset.set(reg);
+            self.gp_reg_states[usize::from(reg.code())] = RegState::FromInst(iidx);
+        }
     }
 
     /// Forcibly assign the floating point register `reg`, which must be in the [RegState::Empty] state,
diff --git a/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs b/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs
index ffca2819c..78ff3ec4f 100644
--- a/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs
+++ b/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs
@@ -1154,7 +1154,7 @@ impl<'a> Assemble<'a> {
             debug_assert!(size <= REG64_BYTESIZE);
             match m {
                 VarLocation::Register(reg_alloc::Register::GP(reg)) => {
-                    self.ra.force_assign_inst_gp_reg(iidx, reg);
+                    self.ra.force_assign_inst_gp_reg(&mut self.asm, iidx, reg);
                 }
                 VarLocation::Register(reg_alloc::Register::FP(reg)) => {
                     self.ra.force_assign_inst_fp_reg(iidx, reg);
@@ -4381,4 +4381,33 @@ mod tests {
             ",
         );
     }
+
+    #[test]
+    fn cg_aliasing_loadtis() {
+        let mut m = jit_ir::Module::new(0, 0).unwrap();
+
+        // Create two trace inputs whose locations alias.
+        let loc = yksmp::Location::Register(13, 1, 0, [].into());
+        m.push_tiloc(loc);
+        let ti_inst = jit_ir::LoadTraceInputInst::new(0, m.int8_tyidx());
+        let op1 = m.push_and_make_operand(ti_inst.clone().into()).unwrap();
+        let op2 = m.push_and_make_operand(ti_inst.into()).unwrap();
+
+        let add_inst = jit_ir::BinOpInst::new(op1, jit_ir::BinOp::Add, op2);
+        m.push(add_inst.into()).unwrap();
+
+        let mt = MT::new().unwrap();
+        let hl = HotLocation {
+            kind: HotLocationKind::Tracing,
+            tracecompilation_errors: 0,
+        };
+
+        Assemble::new(&m, None, None)
+            .unwrap()
+            .codegen(mt, Arc::new(Mutex::new(hl)), None)
+            .unwrap()
+            .as_any()
+            .downcast::<X64CompiledTrace>()
+            .unwrap();
+    }
 }

From 844d090fcac5983ceee268247fa47ceaa13a27be Mon Sep 17 00:00:00 2001
From: Edd Barrett
Date: Mon, 25 Nov 2024 10:19:42 +0000
Subject: [PATCH 2/2] Fix a side-tracing + hardware tracing bug.

When we are constructing a side trace we will have first deopted back to
the conditional branch that initiated side-tracing. Because the
conditional branch is re-executed, this means that when we are doing
hardware tracing, we will likely see spurious blocks at the beginning of
the trace that are not desirable in the side-trace. We therefore skip
them.
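In outline, the skip is the following loop (shown in full in the
trace_builder.rs hunk below; `lookup_aot_block` maps a raw trace block
back to an optional AOT block ID, so the comparison with `prev_bid`
drops every leading block that maps back to the re-executed branch):

    while self.lookup_aot_block(trace_iter.peek().unwrap()) == prev_bid {
        trace_iter.next().unwrap();
    }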
Fixes crashes like this, where we get a local_map miss for local
variables relating to the conditional branch:

  thread '<unnamed>' panicked at ykrt/src/compile/jitc_yk/trace_builder.rs:404:69:
  no entry found for key

Co-authored-by: Lukas Diekmann
---
 ykrt/src/compile/jitc_yk/trace_builder.rs | 62 ++++++++++++++++-------
 ykrt/src/trace/hwt/mapper.rs              | 13 ++++-
 2 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/ykrt/src/compile/jitc_yk/trace_builder.rs b/ykrt/src/compile/jitc_yk/trace_builder.rs
index c1db77998..7203e84a2 100644
--- a/ykrt/src/compile/jitc_yk/trace_builder.rs
+++ b/ykrt/src/compile/jitc_yk/trace_builder.rs
@@ -1202,29 +1202,53 @@ impl TraceBuilder {
             })
             .collect::<Vec<_>>();
 
-            #[cfg(tracer_swt)]
+            // When side-tracing a switch guard failure, we need to reprocess the switch statement
+            // (and only the switch statement) in order to emit a guard at the beginning of the
+            // side-trace to check that the case requiring execution is the case the trace
+            // captures.
+            //
+            // Note that it is not necessary to emit such a guard into side-traces stemming from
+            // regular conditionals, since a conditional has only two successors. The parent trace
+            // captures one, so by construction the side trace must capture the other.
+            let prevbb = self.aot_mod.bblock(prev_bid.as_ref().unwrap());
+            if let aot_ir::Inst::Switch {
+                test_val,
+                default_dest,
+                case_values,
+                case_dests,
+                safepoint,
+            } = &prevbb.insts.last().unwrap()
             {
-                // FIXME: This is a hack! When we side-trace the guard failure of a switch
-                // statement, the software-tracer doesn't report the switch-statement block as the
-                // first block in the trace. This means we don't generate a guard at the top of the
-                // side-trace checking that the switch-case is correct when executing the trace. We
-                // work around this force processing the previous block that's passed in via
-                // `SideTraceInfo` if it's last instruction is a switch statement. Technically,
-                // this is the correct fix for hardware-tracing too, since we shouldn't re-process
-                // the switch-statement block, as it was already executed in the parent trace,
-                // which is problematic if the block has side-effects.
-                let prevbb = self.aot_mod.bblock(prev_bid.as_ref().unwrap());
-                if matches!(prevbb.insts.last(), Some(aot_ir::Inst::Switch { .. })) {
-                    let nextbb = match &tas.first() {
-                        Some(b) => self.lookup_aot_block(b),
-                        _ => panic!(),
-                    };
-                    self.process_block(prev_bid.as_ref().unwrap(), &None, nextbb)?;
-                }
+                let nextbb = match &tas.first() {
+                    Some(b) => self.lookup_aot_block(b),
+                    _ => panic!(),
+                };
+                self.handle_switch(
+                    prev_bid.as_ref().unwrap(), // this is safe: we've just created this above
+                    prevbb.insts.len() - 1,
+                    safepoint,
+                    nextbb.as_ref().unwrap(),
+                    test_val,
+                    default_dest,
+                    case_values,
+                    case_dests,
+                )?;
             }
         }
-
+        // The variable `prev_bid` contains the block of the guard that initiated side-tracing (for
+        // normal traces this is set to `None`). When hardware tracing, we capture this block again
+        // as part of the side-trace. However, since we've already processed this block in the
+        // parent trace, we must not process it again in the side-trace.
+        //
+        // Typically, the mapper would strip this block for us, but for codegen-related reasons,
+        // e.g. a switch statement codegenning to many machine blocks, it's possible for multiple
+        // duplicates of this same block to show up here, which all need to be skipped.
         let mut trace_iter = tas.into_iter().peekable();
+        if prev_bid.is_some() {
+            while self.lookup_aot_block(trace_iter.peek().unwrap()) == prev_bid {
+                trace_iter.next().unwrap();
+            }
+        }
 
         if sti.is_none() {
             // Find the block containing the control point call. This is the (sole) predecessor of the
diff --git a/ykrt/src/trace/hwt/mapper.rs b/ykrt/src/trace/hwt/mapper.rs
index cc86d25cb..3747f92a2 100644
--- a/ykrt/src/trace/hwt/mapper.rs
+++ b/ykrt/src/trace/hwt/mapper.rs
@@ -169,7 +169,18 @@ impl Iterator for HWTTraceIterator {
     type Item = Result;
     fn next(&mut self) -> Option<Self::Item> {
         if self.tas_generated == 0 {
-            // The first block contains the control point, which we need to remove.
+            // Remove the first block.
+            //
+            // If we are collecting a top-level trace, this removes the remainder of the block
+            // containing the control point.
+            //
+            // If we are side-tracing then this attempts to remove the block containing the failed
+            // guard, which is captured by the hardware tracer, but which we have already executed
+            // in the parent trace. Note, though, that some conditionals (e.g. switches) can span
+            // multiple machine blocks, which are not all removed here. Since we don't have enough
+            // information at this level to remove all of them, there's a workaround in the trace
+            // builder.
+            //
             // As a rough proxy for "check that we removed only the thing we want to remove", we know
             // that the control point will be contained in a single mappable block. The `unwrap` can
             // only fail if our assumption about the block is incorrect (i.e. some part of the system
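
For readers outside the yk codebase, here is a minimal, self-contained
sketch of the dealiasing-by-spilling idea from patch 1. All names
(`ToyAlloc`, `force_assign`) are hypothetical stand-ins for `LSRegAlloc`
and `force_assign_inst_gp_reg`; the real code also emits the spill `mov`
via dynasm and tracks value sizes, which this sketch omits:

    use std::collections::HashMap;

    // Toy allocator state: which SSA value each register holds, plus a
    // map from spilled values to stack slots.
    #[derive(Default)]
    struct ToyAlloc {
        reg_contents: HashMap<&'static str, usize>, // register -> SSA value index
        spill_slot: HashMap<usize, usize>,          // SSA value index -> stack slot
        next_slot: usize,
    }

    impl ToyAlloc {
        // Assign `val` to `reg`. If `reg` already holds another value
        // (i.e. two trace inputs alias the same register), spill `val`
        // to a fresh stack slot instead, implicitly dealiasing it.
        fn force_assign(&mut self, reg: &'static str, val: usize) {
            if self.reg_contents.contains_key(reg) {
                self.spill_slot.insert(val, self.next_slot);
                self.next_slot += 1;
            } else {
                self.reg_contents.insert(reg, val);
            }
        }
    }

    fn main() {
        let mut ra = ToyAlloc::default();
        ra.force_assign("RAX", 0); // %0 lives in RAX
        ra.force_assign("RAX", 1); // %1 also claims RAX: spilled, not aliased
        assert_eq!(ra.reg_contents["RAX"], 0);
        assert_eq!(ra.spill_slot[&1], 0);
    }

The point is that the second claimant of an occupied register never
becomes a register-resident alias, so the rest of the allocator can keep
assuming each register holds at most one SSA value.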