diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index 6c8782e9cc55..fbd7dad611ca 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -179,3 +179,106 @@ fn isa_constructor(
     let backend = X64Backend::new_with_flags(triple, shared_flags, isa_flags);
     Box::new(backend)
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::cursor::{Cursor, FuncCursor};
+    use crate::ir::types::*;
+    use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature};
+    use crate::isa::CallConv;
+    use crate::settings;
+    use crate::settings::Configurable;
+    use core::str::FromStr;
+    use target_lexicon::Triple;
+
+    /// We have to test cold blocks by observing final machine code,
+    /// rather than VCode, because the VCode orders blocks in lowering
+    /// order, not emission order. (The exact difference between the
+    /// two is that cold blocks are sunk in the latter.) We might as
+    /// well do the test here, where we have a backend to use.
+    #[test]
+    fn test_cold_blocks() {
+        let name = ExternalName::testcase("test0");
+        let mut sig = Signature::new(CallConv::SystemV);
+        sig.params.push(AbiParam::new(I32));
+        sig.returns.push(AbiParam::new(I32));
+        let mut func = Function::with_name_signature(name, sig);
+
+        let bb0 = func.dfg.make_block();
+        let arg0 = func.dfg.append_block_param(bb0, I32);
+        let bb1 = func.dfg.make_block();
+        let bb2 = func.dfg.make_block();
+        let bb3 = func.dfg.make_block();
+        let bb1_param = func.dfg.append_block_param(bb1, I32);
+        let bb3_param = func.dfg.append_block_param(bb3, I32);
+
+        let mut pos = FuncCursor::new(&mut func);
+
+        pos.insert_block(bb0);
+        let v0 = pos.ins().iconst(I32, 0x1234);
+        let v1 = pos.ins().iadd(arg0, v0);
+        pos.ins().brnz(v1, bb1, &[v1]);
+        pos.ins().jump(bb2, &[]);
+
+        pos.insert_block(bb1);
+        let v2 = pos.ins().isub(v1, v0);
+        let v3 = pos.ins().iadd(v2, bb1_param);
+        pos.ins().brnz(v1, bb2, &[]);
+        pos.ins().jump(bb3, &[v3]);
+
+        pos.func.layout.set_cold(bb2);
+        pos.insert_block(bb2);
+        let v4 = pos.ins().iadd(v1, v0);
+        pos.ins().brnz(v4, bb2, &[]);
+        pos.ins().jump(bb1, &[v4]);
+
+        pos.insert_block(bb3);
+        pos.ins().return_(&[bb3_param]);
+
+        let mut shared_flags_builder = settings::builder();
+        shared_flags_builder.set("opt_level", "none").unwrap();
+        shared_flags_builder.set("enable_verifier", "true").unwrap();
+        let shared_flags = settings::Flags::new(shared_flags_builder);
+        let isa_flags = x64_settings::Flags::new(&shared_flags, x64_settings::builder());
+        let backend = X64Backend::new_with_flags(
+            Triple::from_str("x86_64").unwrap(),
+            shared_flags,
+            isa_flags,
+        );
+        let result = backend
+            .compile_function(&mut func, /* want_disasm = */ false)
+            .unwrap();
+        let code = result.buffer.data();
+
+        // 00000000  55                push rbp
+        // 00000001  4889E5            mov rbp,rsp
+        // 00000004  4889FE            mov rsi,rdi
+        // 00000007  81C634120000      add esi,0x1234
+        // 0000000D  85F6              test esi,esi
+        // 0000000F  0F841B000000      jz near 0x30
+        // 00000015  4889F7            mov rdi,rsi
+        // 00000018  4889F0            mov rax,rsi
+        // 0000001B  81E834120000      sub eax,0x1234
+        // 00000021  01F8              add eax,edi
+        // 00000023  85F6              test esi,esi
+        // 00000025  0F8505000000      jnz near 0x30
+        // 0000002B  4889EC            mov rsp,rbp
+        // 0000002E  5D                pop rbp
+        // 0000002F  C3                ret
+        // 00000030  4889F7            mov rdi,rsi    <--- cold block
+        // 00000033  81C734120000      add edi,0x1234
+        // 00000039  85FF              test edi,edi
+        // 0000003B  0F85EFFFFFFF      jnz near 0x30
+        // 00000041  E9D2FFFFFF        jmp 0x18
+
+        let golden = vec![
+            85, 72, 137, 229, 72, 137, 254, 129, 198, 52, 18, 0, 0, 133, 246, 15, 132, 27, 0, 0, 0,
+            72, 137, 247, 72, 137, 240, 129, 232, 52, 18, 0, 0, 1, 248, 133, 246, 15, 133, 5, 0, 0,
+            0, 72, 137, 236, 93, 195, 72, 137, 247, 129, 199, 52, 18, 0, 0, 133, 255, 15, 133, 239,
+            255, 255, 255, 233, 210, 255, 255, 255,
+        ];
+
+        assert_eq!(code, &golden[..]);
+    }
+}
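The golden bytes in the test above are the raw contents of the emitted buffer, matching the disassembly comment byte for byte. If a later codegen change legitimately perturbs the output, the vector has to be refreshed by hand; a minimal sketch of one way to do that (the `dump_golden` helper is hypothetical and not part of this patch):

// Hypothetical helper, not part of this patch: print emitted bytes in
// the decimal form used by `golden` above so they can be pasted back
// into the test. The commented listing can then be refreshed by
// disassembling the same bytes (e.g. with `ndisasm -b 64`).
fn dump_golden(code: &[u8]) {
    let bytes: Vec<String> = code.iter().map(|b| b.to_string()).collect();
    println!("let golden = vec![{}];", bytes.join(", "));
}

fn main() {
    // Example with the first two instructions of the listing above
    // (push rbp; mov rbp,rsp):
    dump_golden(&[0x55, 0x48, 0x89, 0xE5]);
}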
diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs
index 3cdf1228ca1f..5c805c4029a8 100644
--- a/cranelift/codegen/src/machinst/blockorder.rs
+++ b/cranelift/codegen/src/machinst/blockorder.rs
@@ -99,6 +99,13 @@ pub struct BlockLoweringOrder {
     /// blocks.
     #[allow(dead_code)]
     orig_map: SecondaryMap<Block, Option<BlockIndex>>,
+    /// Cold blocks. These blocks are not reordered in the
+    /// `lowered_order` above; the lowered order must respect RPO
+    /// (uses after defs) in order for lowering to be
+    /// correct. Instead, this set is used to provide `is_cold()`,
+    /// which is used by VCode emission to sink the blocks at the last
+    /// moment (when we actually emit bytes into the MachBuffer).
+    cold_blocks: FxHashSet<BlockIndex>,
 }
 
 /// The origin of a block in the lowered block-order: either an original CLIF
@@ -378,31 +385,28 @@ impl BlockLoweringOrder {
         postorder.reverse();
         let mut rpo = postorder;
-
-        // Step 3: sink any cold blocks to the end of the
-        // function. Put the "deferred last" block truly at the end;
-        // this is a correctness requirement (for fallthrough
-        // returns).
-        rpo.sort_by_key(|block| {
-            block
-                .0
-                .orig_block()
-                .map(|block| f.layout.is_cold(block))
-                .unwrap_or(false)
-        });
-
         if let Some(d) = deferred_last {
             rpo.push(d);
         }
 
-        // Step 4: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
+        // Step 3: now that we have RPO, build the BlockIndex/BB fwd/rev maps.
         let mut lowered_order = vec![];
+        let mut cold_blocks = FxHashSet::default();
         let mut lowered_succ_ranges = vec![];
         let mut lb_to_bindex = FxHashMap::default();
         for (block, succ_range) in rpo.into_iter() {
-            lb_to_bindex.insert(block, lowered_order.len() as BlockIndex);
+            let index = lowered_order.len() as BlockIndex;
+            lb_to_bindex.insert(block, index);
             lowered_order.push(block);
             lowered_succ_ranges.push(succ_range);
+
+            if block
+                .orig_block()
+                .map(|b| f.layout.is_cold(b))
+                .unwrap_or(false)
+            {
+                cold_blocks.insert(index);
+            }
         }
 
         let lowered_succ_indices = lowered_succs
@@ -424,6 +428,7 @@ impl BlockLoweringOrder {
             lowered_succ_indices,
             lowered_succ_ranges,
             orig_map,
+            cold_blocks,
         };
         log::trace!("BlockLoweringOrder: {:?}", result);
         result
     }
@@ -439,6 +444,11 @@ impl BlockLoweringOrder {
         let range = self.lowered_succ_ranges[block as usize];
        &self.lowered_succ_indices[range.0..range.1]
     }
+
+    /// Determine whether the given lowered-block index is cold.
+    pub fn is_cold(&self, block: BlockIndex) -> bool {
+        self.cold_blocks.contains(&block)
+    }
 }
 
 #[cfg(test)]
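The invariant preserved here is that the lowered order stays a reverse postorder (so defs are seen in the right order during lowering), while coldness is recorded in a side set that only emission consults. A self-contained sketch of the same separation on a toy CFG, with plain std collections standing in for Cranelift's `LoweredBlock`/`FxHashSet` (all names illustrative):

use std::collections::HashSet;

/// Reverse postorder over a toy CFG given as successor lists.
fn rpo(succs: &[Vec<usize>], entry: usize) -> Vec<usize> {
    fn visit(b: usize, succs: &[Vec<usize>], seen: &mut [bool], post: &mut Vec<usize>) {
        seen[b] = true;
        for &s in &succs[b] {
            if !seen[s] {
                visit(s, succs, seen, post);
            }
        }
        post.push(b);
    }
    let mut seen = vec![false; succs.len()];
    let mut post = Vec::new();
    visit(entry, succs, &mut seen, &mut post);
    post.reverse();
    post
}

fn main() {
    // bb0 -> {bb1, bb2}, bb1 -> {bb2, bb3}, bb2 -> {bb2, bb1}, bb3 -> {}:
    // the CFG built by the x64 test above, with bb2 marked cold.
    let succs = vec![vec![1, 2], vec![2, 3], vec![2, 1], vec![]];
    // The lowering order is plain RPO; bb2 stays wherever RPO puts it.
    assert_eq!(rpo(&succs, 0), vec![0, 1, 3, 2]);
    // Coldness lives in a side set, queried only at emission time
    // (the analogue of `BlockLoweringOrder::is_cold`).
    let cold: HashSet<usize> = [2].into_iter().collect();
    let is_cold = |b: usize| cold.contains(&b);
    assert!(is_cold(2) && !is_cold(1));
}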
diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs
index 0e8300f49d58..3043e193472b 100644
--- a/cranelift/codegen/src/machinst/vcode.rs
+++ b/cranelift/codegen/src/machinst/vcode.rs
@@ -470,11 +470,24 @@ impl<I: VCodeInst> VCode<I> {
         let mut inst_ends = vec![0; self.insts.len()];
         let mut label_insn_iix = vec![0; self.num_blocks()];
 
+        // Construct the final order we emit code in: cold blocks at the end.
+        let mut final_order: SmallVec<[BlockIndex; 16]> = smallvec![];
+        let mut cold_blocks: SmallVec<[BlockIndex; 16]> = smallvec![];
+        for block in 0..self.num_blocks() {
+            let block = block as BlockIndex;
+            if self.block_order.is_cold(block) {
+                cold_blocks.push(block);
+            } else {
+                final_order.push(block);
+            }
+        }
+        final_order.extend(cold_blocks);
+
+        // Emit blocks.
         let mut safepoint_idx = 0;
         let mut cur_srcloc = None;
         let mut last_offset = None;
-        for block in 0..self.num_blocks() {
-            let block = block as BlockIndex;
+        for block in final_order {
             let new_offset = I::align_basic_block(buffer.cur_offset());
             while new_offset > buffer.cur_offset() {
                 // Pad with NOPs up to the aligned block offset.
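The order construction above is a stable partition: hot blocks keep their relative RPO positions and cold blocks are appended at the end, also in their original relative order. A standalone sketch of that step, with plain `Vec`/`HashSet` standing in for `SmallVec`/`FxHashSet` (illustrative only, not the patch's code):

use std::collections::HashSet;

/// Stable partition: hot block indices first, then cold ones,
/// each group in its original relative order.
fn emission_order(num_blocks: u32, is_cold: impl Fn(u32) -> bool) -> Vec<u32> {
    let mut final_order = Vec::with_capacity(num_blocks as usize);
    let mut cold_blocks = Vec::new();
    for block in 0..num_blocks {
        if is_cold(block) {
            cold_blocks.push(block);
        } else {
            final_order.push(block);
        }
    }
    final_order.extend(cold_blocks);
    final_order
}

fn main() {
    // Five blocks with blocks 1 and 3 cold: hot blocks keep their
    // order, cold ones are sunk to the end in theirs.
    let cold: HashSet<u32> = [1, 3].into_iter().collect();
    assert_eq!(
        emission_order(5, |b| cold.contains(&b)),
        vec![0, 2, 4, 1, 3],
    );
}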