From ef1b2d2fa81f02d559da1b493d5ddff6d8f26e6d Mon Sep 17 00:00:00 2001
From: Chris Fallin
Date: Thu, 20 Jan 2022 15:40:56 -0800
Subject: [PATCH] Cranelift: Fix cold-blocks-related lowering bug.

If a block is marked cold but has side-effect-free code that is only
used by side-effectful code in non-cold blocks, we will erroneously
fail to emit it, causing a regalloc failure.

This is due to the interaction of block ordering and lowering: we rely
on block ordering to visit uses before defs (except for backedges) so
that we can effectively do an inline liveness analysis and skip
lowering operations that are not used anywhere. This "inline DCE" is
needed because instruction lowering can pattern-match and merge one
instruction into another, removing the need to generate the source
instruction.

Unfortunately, the way that I added cold-block support in #3698 was
oblivious to this -- it just changed the block sort order. For
efficiency reasons, we generate code in its final order directly, so
it would not be tenable to generate it in e.g. RPO first and then
reorder cold blocks to the bottom; we really do want to visit in the
same order as the final code.

This PR fixes the bug by moving the point at which cold blocks are
sunk to emission-time instead. This is cheaper than either trying to
visit blocks during lowering in RPO but add to VCode out-of-order, or
trying to do some expensive analysis to recover proper liveness. It's
not clear that the latter would be possible anyway -- the need to
lower some instructions depends on other instructions' isel
results/merging success, so we really do need to visit in RPO, and we
can't simply lower all instructions as side-effecting roots (some
can't be toplevel nodes).

The one downside of this approach is that the VCode itself still has
cold blocks inline; so in the text format (and hence compile-tests)
it's not possible to see the sinking. This PR adds a test for
cold-block sinking that actually verifies the machine code. (The test
also includes an add-instruction in the cold path that would have been
incorrectly skipped prior to this fix.)

Fortunately this bug would not have been triggered by the one current
use of cold blocks in #3699, because there the only operation in the
cold block was an (always effectful) call instruction. The worst-case
effect of the bug in other code would be a regalloc panic; no silent
miscompilations could result.
---
 cranelift/codegen/src/isa/x64/mod.rs         | 103 +++++++++++++++++++
 cranelift/codegen/src/machinst/blockorder.rs |  40 ++++---
 cranelift/codegen/src/machinst/vcode.rs      |  17 ++-
 3 files changed, 143 insertions(+), 17 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index 6c8782e9cc55..fbd7dad611ca 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -179,3 +179,106 @@ fn isa_constructor(
     let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags);
     Box::new(backend)
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::cursor::{Cursor, FuncCursor};
+    use crate::ir::types::*;
+    use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
+    use crate::isa::CallConv;
+    use crate::settings;
+    use crate::settings::Configurable;
+    use core::str::FromStr;
+    use target_lexicon::Triple;
+
+    /// We have to test cold blocks by observing final machine code,
+    /// rather than VCode, because the VCode orders blocks in lowering
+    /// order, not emission order. (The exact difference between the
+    /// two is that cold blocks are sunk in the latter.) We might as
+    /// well do the test here, where we have a backend to use.
+    #[test]
+    fn test_cold_blocks() {
+        let name = ExternalName::testcase("test0");
+        let mut sig = Signature::new(CallConv::SystemV);
+        sig.params.push(AbiParam::new(I32));
+        sig.returns.push(AbiParam::new(I32));
+        let mut func = Function::with_name_signature(name, sig);
+
+        let bb0 = func.dfg.make_block();
+        let arg0 = func.dfg.append_block_param(bb0, I32);
+        let bb1 = func.dfg.make_block();
+        let bb2 = func.dfg.make_block();
+        let bb3 = func.dfg.make_block();
+        let bb1_param = func.dfg.append_block_param(bb1, I32);
+        let bb3_param = func.dfg.append_block_param(bb3, I32);
+
+        let mut pos = FuncCursor::new(&mut func);
+
+        pos.insert_block(bb0);
+        let v0 = pos.ins().iconst(I32, 0x1234);
+        let v1 = pos.ins().iadd(arg0, v0);
+        pos.ins().brnz(v1, bb1, &[v1]);
+        pos.ins().jump(bb2, &[]);
+
+        pos.insert_block(bb1);
+        let v2 = pos.ins().isub(v1, v0);
+        let v3 = pos.ins().iadd(v2, bb1_param);
+        pos.ins().brnz(v1, bb2, &[]);
+        pos.ins().jump(bb3, &[v3]);
+
+        pos.func.layout.set_cold(bb2);
+        pos.insert_block(bb2);
+        let v4 = pos.ins().iadd(v1, v0);
+        pos.ins().brnz(v4, bb2, &[]);
+        pos.ins().jump(bb1, &[v4]);
+
+        pos.insert_block(bb3);
+        pos.ins().return_(&[bb3_param]);
+
+        let mut shared_flags_builder = settings::builder();
+        shared_flags_builder.set("opt_level", "none").unwrap();
+        shared_flags_builder.set("enable_verifier", "true").unwrap();
+        let shared_flags = settings::Flags::new(shared_flags_builder);
+        let isa_flags = x64_settings::Flags::new(&shared_flags, x64_settings::builder());
+        let backend = X64Backend::new_with_flags(
+            Triple::from_str("x86_64").unwrap(),
+            shared_flags,
+            isa_flags,
+        );
+        let result = backend
+            .compile_function(&mut func, /* want_disasm = */ false)
+            .unwrap();
+        let code = result.buffer.data();
+
+        // 00000000  55                push rbp
+        // 00000001  4889E5            mov rbp,rsp
+        // 00000004  4889FE            mov rsi,rdi
+        // 00000007  81C634120000      add esi,0x1234
+        // 0000000D  85F6              test esi,esi
+        // 0000000F  0F841B000000      jz near 0x30
+        // 00000015  4889F7            mov rdi,rsi
+        // 00000018  4889F0            mov rax,rsi
+        // 0000001B  81E834120000      sub eax,0x1234
+        // 00000021  01F8              add eax,edi
+        // 00000023  85F6              test esi,esi
+        // 00000025  0F8505000000      jnz near 0x30
+        // 0000002B  4889EC            mov rsp,rbp
+        // 0000002E  5D                pop rbp
+        // 0000002F  C3                ret
+        // 00000030  4889F7            mov rdi,rsi <--- cold block
+        // 00000033  81C734120000      add edi,0x1234
+        // 00000039  85FF              test edi,edi
+        // 0000003B  0F85EFFFFFFF      jnz near 0x30
+        // 00000041  E9D2FFFFFF        jmp 0x18
+
+        let golden = vec![
+            85, 72, 137, 229, 72, 137, 254, 129, 198, 52, 18, 0, 0, 133, 246, 15, 132, 27, 0, 0, 0,
+            72, 137, 247, 72, 137, 240, 129, 232, 52, 18, 0, 0, 1, 248, 133, 246, 15, 133, 5, 0, 0,
+            0, 72, 137, 236, 93, 195, 72, 137, 247, 129, 199, 52, 18, 0, 0, 133, 255, 15, 133, 239,
+            255, 255, 255, 233, 210, 255, 255, 255,
+        ];
+
+        assert_eq!(code, &golden[..]);
+    }
+}
diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs
index 3cdf1228ca1f..5c805c4029a8 100644
--- a/cranelift/codegen/src/machinst/blockorder.rs
+++ b/cranelift/codegen/src/machinst/blockorder.rs
@@ -99,6 +99,13 @@ pub struct BlockLoweringOrder {
     /// blocks.
     #[allow(dead_code)]
     orig_map: SecondaryMap>,
+    /// Cold blocks. These blocks are not reordered in the
+    /// `lowered_order` above; the lowered order must respect RPO
+    /// (uses after defs) in order for lowering to be
+    /// correct. Instead, this set is used to provide `is_cold()`,
+    /// which is used by VCode emission to sink the blocks at the last
+    /// moment (when we actually emit bytes into the MachBuffer).
+    cold_blocks: FxHashSet<BlockIndex>,
 }
 
 /// The origin of a block in the lowered block-order: either an original CLIF
@@ -378,31 +385,28 @@ impl BlockLoweringOrder {
 
         postorder.reverse();
         let mut rpo = postorder;
-
-        // Step 3: sink any cold blocks to the end of the
-        // function. Put the "deferred last" block truly at the end;
-        // this is a correctness requirement (for fallthrough
-        // returns).
-        rpo.sort_by_key(|block| {
-            block
-                .0
-                .orig_block()
-                .map(|block| f.layout.is_cold(block))
-                .unwrap_or(false)
-        });
-
         if let Some(d) = deferred_last {
             rpo.push(d);
         }
 
-        // Step 4: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
+        // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
         let mut lowered_order = vec![];
+        let mut cold_blocks = FxHashSet::default();
         let mut lowered_succ_ranges = vec![];
         let mut lb_to_bindex = FxHashMap::default();
         for (block, succ_range) in rpo.into_iter() {
-            lb_to_bindex.insert(block, lowered_order.len() as BlockIndex);
+            let index = lowered_order.len() as BlockIndex;
+            lb_to_bindex.insert(block, index);
             lowered_order.push(block);
             lowered_succ_ranges.push(succ_range);
+
+            if block
+                .orig_block()
+                .map(|b| f.layout.is_cold(b))
+                .unwrap_or(false)
+            {
+                cold_blocks.insert(index);
+            }
         }
 
         let lowered_succ_indices = lowered_succs
@@ -424,6 +428,7 @@ impl BlockLoweringOrder {
             lowered_succ_indices,
             lowered_succ_ranges,
             orig_map,
+            cold_blocks,
         };
         log::trace!("BlockLoweringOrder: {:?}", result);
         result
@@ -439,6 +444,11 @@ impl BlockLoweringOrder {
         let range = self.lowered_succ_ranges[block as usize];
         &self.lowered_succ_indices[range.0..range.1]
     }
+
+    /// Determine whether the given lowered-block index is cold.
+    pub fn is_cold(&self, block: BlockIndex) -> bool {
+        self.cold_blocks.contains(&block)
+    }
 }
 
 #[cfg(test)]
diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs
index 0e8300f49d58..3043e193472b 100644
--- a/cranelift/codegen/src/machinst/vcode.rs
+++ b/cranelift/codegen/src/machinst/vcode.rs
@@ -470,11 +470,24 @@ impl VCode {
         let mut inst_ends = vec![0; self.insts.len()];
         let mut label_insn_iix = vec![0; self.num_blocks()];
 
+        // Construct the final order we emit code in: cold blocks at the end.
+        let mut final_order: SmallVec<[BlockIndex; 16]> = smallvec![];
+        let mut cold_blocks: SmallVec<[BlockIndex; 16]> = smallvec![];
+        for block in 0..self.num_blocks() {
+            let block = block as BlockIndex;
+            if self.block_order.is_cold(block) {
+                cold_blocks.push(block);
+            } else {
+                final_order.push(block);
+            }
+        }
+        final_order.extend(cold_blocks);
+
+        // Emit blocks.
         let mut safepoint_idx = 0;
         let mut cur_srcloc = None;
         let mut last_offset = None;
-        for block in 0..self.num_blocks() {
-            let block = block as BlockIndex;
+        for block in final_order {
             let new_offset = I::align_basic_block(buffer.cur_offset());
             while new_offset > buffer.cur_offset() {
                 // Pad with NOPs up to the aligned block offset.