From 06a58e423981f1952489890280d2649bc4cb45a3 Mon Sep 17 00:00:00 2001
From: Chris Fallin
Date: Thu, 20 Jan 2022 15:40:56 -0800
Subject: [PATCH] Cranelift: Fix cold-blocks-related lowering bug.

If a block is marked cold but has side-effect-free code that is only
used by side-effectful code in non-cold blocks, we will erroneously
fail to emit it, causing a regalloc failure.

This is due to the interaction of block ordering and lowering: we rely
on block ordering to visit uses before defs (except for backedges) so
that we can effectively do an inline liveness analysis and skip
lowering operations that are not used anywhere. This "inline DCE" is
needed because instruction lowering can pattern-match and merge one
instruction into another, removing the need to generate the source
instruction.

Unfortunately, the way that I added cold-block support in #3698 was
oblivious to this -- it just changed the block sort order. For
efficiency reasons, we generate code in its final order directly, so
it would not be tenable to generate it in e.g. RPO first and then
reorder cold blocks to the bottom; we really do want to visit in the
same order as the final code.

This PR fixes the bug by moving the point at which cold blocks are
sunk to emission-time instead. This is cheaper than either trying to
visit blocks during lowering in RPO but add to VCode out-of-order, or
trying to do some expensive analysis to recover proper liveness. It's
not clear that the latter would be possible anyway -- the need to
lower some instructions depends on other instructions' isel
results/merging success, so we really do need to visit in RPO, and we
can't simply lower all instructions as side-effecting roots (some
can't be toplevel nodes).

The one downside of this approach is that the VCode itself still has
cold blocks inline; so in the text format (and hence compile-tests)
it's not possible to see the sinking. This PR adds a test for
cold-block sinking that actually verifies the machine code. (The test
also includes an add-instruction in the cold path that would have been
incorrectly skipped prior to this fix.)

Fortunately this bug would not have been triggered by the one current
use of cold blocks in #3699, because there the only operation in the
cold block was an (always effectful) call instruction. The worst-case
effect of the bug in other code would be a regalloc panic; no silent
miscompilations could result.

Depends on #3708.
---
 cranelift/codegen/src/isa/x64/mod.rs         | 103 +++++++++++++++++++
 cranelift/codegen/src/machinst/blockorder.rs |  31 +++---
 cranelift/codegen/src/machinst/vcode.rs      |  17 ++-
 3 files changed, 135 insertions(+), 16 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index 6c8782e9cc55..fbd7dad611ca 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -179,3 +179,106 @@ fn isa_constructor(
     let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags);
     Box::new(backend)
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::cursor::{Cursor, FuncCursor};
+    use crate::ir::types::*;
+    use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
+    use crate::isa::CallConv;
+    use crate::settings;
+    use crate::settings::Configurable;
+    use core::str::FromStr;
+    use target_lexicon::Triple;
+
+    /// We have to test cold blocks by observing final machine code,
+    /// rather than VCode, because the VCode orders blocks in lowering
+    /// order, not emission order. (The exact difference between the
+    /// two is that cold blocks are sunk in the latter.) We might as
+    /// well do the test here, where we have a backend to use.
+    #[test]
+    fn test_cold_blocks() {
+        let name = ExternalName::testcase("test0");
+        let mut sig = Signature::new(CallConv::SystemV);
+        sig.params.push(AbiParam::new(I32));
+        sig.returns.push(AbiParam::new(I32));
+        let mut func = Function::with_name_signature(name, sig);
+
+        let bb0 = func.dfg.make_block();
+        let arg0 = func.dfg.append_block_param(bb0, I32);
+        let bb1 = func.dfg.make_block();
+        let bb2 = func.dfg.make_block();
+        let bb3 = func.dfg.make_block();
+        let bb1_param = func.dfg.append_block_param(bb1, I32);
+        let bb3_param = func.dfg.append_block_param(bb3, I32);
+
+        let mut pos = FuncCursor::new(&mut func);
+
+        pos.insert_block(bb0);
+        let v0 = pos.ins().iconst(I32, 0x1234);
+        let v1 = pos.ins().iadd(arg0, v0);
+        pos.ins().brnz(v1, bb1, &[v1]);
+        pos.ins().jump(bb2, &[]);
+
+        pos.insert_block(bb1);
+        let v2 = pos.ins().isub(v1, v0);
+        let v3 = pos.ins().iadd(v2, bb1_param);
+        pos.ins().brnz(v1, bb2, &[]);
+        pos.ins().jump(bb3, &[v3]);
+
+        pos.func.layout.set_cold(bb2);
+        pos.insert_block(bb2);
+        let v4 = pos.ins().iadd(v1, v0);
+        pos.ins().brnz(v4, bb2, &[]);
+        pos.ins().jump(bb1, &[v4]);
+
+        pos.insert_block(bb3);
+        pos.ins().return_(&[bb3_param]);
+
+        let mut shared_flags_builder = settings::builder();
+        shared_flags_builder.set("opt_level", "none").unwrap();
+        shared_flags_builder.set("enable_verifier", "true").unwrap();
+        let shared_flags = settings::Flags::new(shared_flags_builder);
+        let isa_flags = x64_settings::Flags::new(&shared_flags, x64_settings::builder());
+        let backend = X64Backend::new_with_flags(
+            Triple::from_str("x86_64").unwrap(),
+            shared_flags,
+            isa_flags,
+        );
+        let result = backend
+            .compile_function(&mut func, /* want_disasm = */ false)
+            .unwrap();
+        let code = result.buffer.data();
+
+        // 00000000  55            push rbp
+        // 00000001  4889E5        mov rbp,rsp
+        // 00000004  4889FE        mov rsi,rdi
+        // 00000007  81C634120000  add esi,0x1234
+        // 0000000D  85F6          test esi,esi
+        // 0000000F  0F841B000000  jz near 0x30
+        // 00000015  4889F7        mov rdi,rsi
+        // 00000018  4889F0        mov rax,rsi
+        // 0000001B  81E834120000  sub eax,0x1234
+        // 00000021  01F8          add eax,edi
+        // 00000023  85F6          test esi,esi
+        // 00000025  0F8505000000  jnz near 0x30
+        // 0000002B  4889EC        mov rsp,rbp
+        // 0000002E  5D            pop rbp
+        // 0000002F  C3            ret
+        // 00000030  4889F7        mov rdi,rsi   <--- cold block
+        // 00000033  81C734120000  add edi,0x1234
+        // 00000039  85FF          test edi,edi
+        // 0000003B  0F85EFFFFFFF  jnz near 0x30
+        // 00000041  E9D2FFFFFF    jmp 0x18
+
+        let golden = vec![
+            85, 72, 137, 229, 72, 137, 254, 129, 198, 52, 18, 0, 0, 133, 246, 15, 132, 27, 0, 0, 0,
+            72, 137, 247, 72, 137, 240, 129, 232, 52, 18, 0, 0, 1, 248, 133, 246, 15, 133, 5, 0, 0,
+            0, 72, 137, 236, 93, 195, 72, 137, 247, 129, 199, 52, 18, 0, 0, 133, 255, 15, 133, 239,
+            255, 255, 255, 233, 210, 255, 255, 255,
+        ];
+
+        assert_eq!(code, &golden[..]);
+    }
+}
diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs
index 3cdf1228ca1f..f09d0cf9681b 100644
--- a/cranelift/codegen/src/machinst/blockorder.rs
+++ b/cranelift/codegen/src/machinst/blockorder.rs
@@ -99,6 +99,8 @@ pub struct BlockLoweringOrder {
     /// blocks.
     #[allow(dead_code)]
     orig_map: SecondaryMap<Block, Option<BlockIndex>>,
+    /// Cold-block flags for each lowered block.
+    cold_block: Vec<bool>,
 }
 
 /// The origin of a block in the lowered block-order: either an original CLIF
@@ -378,31 +380,26 @@ impl BlockLoweringOrder {
 
         postorder.reverse();
         let mut rpo = postorder;
-
-        // Step 3: sink any cold blocks to the end of the
-        // function. Put the "deferred last" block truly at the end;
-        // this is a correctness requirement (for fallthrough
-        // returns).
-        rpo.sort_by_key(|block| {
-            block
-                .0
-                .orig_block()
-                .map(|block| f.layout.is_cold(block))
-                .unwrap_or(false)
-        });
-
         if let Some(d) = deferred_last {
             rpo.push(d);
         }
 
-        // Step 4: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
+        // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
         let mut lowered_order = vec![];
+        let mut cold_block = vec![];
         let mut lowered_succ_ranges = vec![];
         let mut lb_to_bindex = FxHashMap::default();
         for (block, succ_range) in rpo.into_iter() {
             lb_to_bindex.insert(block, lowered_order.len() as BlockIndex);
             lowered_order.push(block);
             lowered_succ_ranges.push(succ_range);
+
+            let is_cold = if let Some(b) = block.orig_block() {
+                f.layout.is_cold(b)
+            } else {
+                false
+            };
+            cold_block.push(is_cold);
         }
 
         let lowered_succ_indices = lowered_succs
@@ -424,6 +421,7 @@ impl BlockLoweringOrder {
             lowered_succ_indices,
             lowered_succ_ranges,
             orig_map,
+            cold_block,
         };
         log::trace!("BlockLoweringOrder: {:?}", result);
         result
@@ -439,6 +437,11 @@ impl BlockLoweringOrder {
         let range = self.lowered_succ_ranges[block as usize];
         &self.lowered_succ_indices[range.0..range.1]
     }
+
+    /// Determine whether the given lowered-block index is cold.
+    pub fn is_cold(&self, block: BlockIndex) -> bool {
+        self.cold_block[block as usize]
+    }
 }
 
 #[cfg(test)]
diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs
index 0e8300f49d58..3043e193472b 100644
--- a/cranelift/codegen/src/machinst/vcode.rs
+++ b/cranelift/codegen/src/machinst/vcode.rs
@@ -470,11 +470,24 @@ impl<I: VCodeInst> VCode<I> {
         let mut inst_ends = vec![0; self.insts.len()];
         let mut label_insn_iix = vec![0; self.num_blocks()];
 
+        // Construct the final order we emit code in: cold blocks at the end.
+        let mut final_order: SmallVec<[BlockIndex; 16]> = smallvec![];
+        let mut cold_blocks: SmallVec<[BlockIndex; 16]> = smallvec![];
+        for block in 0..self.num_blocks() {
+            let block = block as BlockIndex;
+            if self.block_order.is_cold(block) {
+                cold_blocks.push(block);
+            } else {
+                final_order.push(block);
+            }
+        }
+        final_order.extend(cold_blocks);
+
+        // Emit blocks.
         let mut safepoint_idx = 0;
         let mut cur_srcloc = None;
         let mut last_offset = None;
-        for block in 0..self.num_blocks() {
-            let block = block as BlockIndex;
+        for block in final_order {
             let new_offset = I::align_basic_block(buffer.cur_offset());
             while new_offset > buffer.cur_offset() {
                 // Pad with NOPs up to the aligned block offset.
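
The emission-time reordering in the vcode.rs hunk above amounts to a stable partition of the lowered block indices: non-cold blocks keep their lowering (RPO) positions, and cold blocks are appended at the end in their original relative order. A minimal standalone sketch of that idea follows; the emission_order helper, the plain Vec/u32 types, and the closure standing in for BlockLoweringOrder::is_cold are illustrative assumptions, not Cranelift APIs.

// Stable partition sketch: hot blocks keep their RPO positions, cold
// blocks are sunk to the end in their original relative order.
// `is_cold` stands in for `BlockLoweringOrder::is_cold`; `u32` stands
// in for `BlockIndex`.
fn emission_order(num_blocks: u32, is_cold: impl Fn(u32) -> bool) -> Vec<u32> {
    let mut final_order = Vec::with_capacity(num_blocks as usize);
    let mut cold_blocks = Vec::new();
    for block in 0..num_blocks {
        if is_cold(block) {
            cold_blocks.push(block);
        } else {
            final_order.push(block);
        }
    }
    // Cold blocks go last, preserving their relative order.
    final_order.extend(cold_blocks);
    final_order
}

fn main() {
    // Five lowered blocks; block 2 is marked cold (as bb2 is in the test above).
    let order = emission_order(5, |b| b == 2);
    assert_eq!(order, vec![0, 1, 3, 4, 2]);
}

Note that the VCode itself is left in lowering order; only the loop that drives MachBuffer emission walks this reordered sequence, which is why the test above checks the final machine code rather than the VCode text.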