diff --git a/build.rs b/build.rs index ea6e6c64071b..3eeb27c1796f 100644 --- a/build.rs +++ b/build.rs @@ -194,6 +194,17 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { ("reference_types", "table_copy_on_imported_tables") => return false, ("reference_types", _) => return true, + ("misc_testsuite", "export_large_signature") + | ("spec_testsuite", "call") + | ("multi_value", "call") + | ("multi_value", "func") => { + // FIXME: These involve functions with very large stack frames that Cranelift currently + // cannot compile using the fastcall (Windows) calling convention. + // See https://github.com/bytecodealliance/wasmtime/pull/1216. + #[cfg(windows)] + return true; + } + _ => {} }, _ => panic!("unrecognized strategy"), diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs index ca70293c05fb..967d302ecb9f 100644 --- a/cranelift/codegen/src/context.rs +++ b/cranelift/codegen/src/context.rs @@ -206,8 +206,8 @@ impl Context { isa: &dyn TargetIsa, kind: FrameUnwindKind, sink: &mut dyn FrameUnwindSink, - ) { - isa.emit_unwind_info(&self.func, kind, sink); + ) -> CodegenResult<()> { + isa.emit_unwind_info(&self.func, kind, sink) } /// Run the verifier on the function. diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 9c91d4219390..af263c2b5bf0 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -398,8 +398,9 @@ pub trait TargetIsa: fmt::Display + Send + Sync { _func: &ir::Function, _kind: binemit::FrameUnwindKind, _sink: &mut dyn binemit::FrameUnwindSink, - ) { + ) -> CodegenResult<()> { // No-op by default + Ok(()) } } diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index db67457a6c76..c683f101ed2d 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -12,8 +12,10 @@ use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion}; use crate::binemit::{FrameUnwindKind, FrameUnwindSink}; use crate::cursor::{Cursor, CursorPosition, EncCursor}; use crate::ir; +use crate::ir::entities::StackSlot; use crate::ir::immediates::Imm64; use crate::ir::stackslot::{StackOffset, StackSize}; +use crate::ir::types; use crate::ir::{ get_probestack_funcref, AbiParam, ArgumentExtension, ArgumentLoc, ArgumentPurpose, FrameLayoutChange, InstBuilder, ValueLoc, @@ -23,6 +25,7 @@ use crate::regalloc::RegisterSet; use crate::result::CodegenResult; use crate::stack_layout::layout_stack; use alloc::borrow::Cow; +use alloc::vec::Vec; use core::i32; use std::boxed::Box; use target_lexicon::{PointerWidth, Triple}; @@ -366,17 +369,18 @@ pub fn allocatable_registers(triple: &Triple, flags: &shared_settings::Flags) -> regs } -/// Get the set of callee-saved registers. +/// Get the set of callee-saved general-purpose registers. fn callee_saved_gprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] { match isa.triple().pointer_width().unwrap() { PointerWidth::U16 => panic!(), PointerWidth::U32 => &[RU::rbx, RU::rsi, RU::rdi], PointerWidth::U64 => { if call_conv.extends_windows_fastcall() { - // "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15 are considered nonvolatile - // and must be saved and restored by a function that uses them." + // "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 are + // considered nonvolatile and must be saved and restored by a function that uses + // them."
// as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention - // RSP & RSB are not listed below, since they are restored automatically during + // RSP & RBP are not listed below, since they are restored automatically during // a function call. If that wasn't the case, function calls (RET) would not work. &[ RU::rbx, @@ -394,12 +398,45 @@ fn callee_saved_gprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] } } +/// Get the set of callee-saved floating-point (SIMD) registers. +fn callee_saved_fprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] { + match isa.triple().pointer_width().unwrap() { + PointerWidth::U16 => panic!(), + PointerWidth::U32 => &[], + PointerWidth::U64 => { + if call_conv.extends_windows_fastcall() { + // "registers RBX, ... , and XMM6-15 are considered nonvolatile and must be saved + // and restored by a function that uses them." + // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention as of + // February 5th, 2020. + &[ + RU::xmm6, + RU::xmm7, + RU::xmm8, + RU::xmm9, + RU::xmm10, + RU::xmm11, + RU::xmm12, + RU::xmm13, + RU::xmm14, + RU::xmm15, + ] + } else { + &[] + } + } + } +} + /// Get the set of callee-saved registers that are used. -fn callee_saved_gprs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterSet { +fn callee_saved_regs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterSet { let mut all_callee_saved = RegisterSet::empty(); for reg in callee_saved_gprs(isa, func.signature.call_conv) { all_callee_saved.free(GPR, *reg as RegUnit); } + for reg in callee_saved_fprs(isa, func.signature.call_conv) { + all_callee_saved.free(FPR, *reg as RegUnit); + } let mut used = RegisterSet::empty(); for value_loc in func.locations.values() { @@ -407,8 +444,14 @@ fn callee_saved_gprs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterS // register. We don't use registers that overlap each other in the x86 ISA, but in others // we do. So this should not be blindly reused. if let ValueLoc::Reg(ru) = *value_loc { - if !used.is_avail(GPR, ru) { - used.free(GPR, ru); + if GPR.contains(ru) { + if !used.is_avail(GPR, ru) { + used.free(GPR, ru); + } + } else if FPR.contains(ru) { + if !used.is_avail(FPR, ru) { + used.free(FPR, ru); + } } } } @@ -424,8 +467,14 @@ fn callee_saved_gprs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterS match func.dfg[inst] { ir::instructions::InstructionData::RegMove { dst, .. } | ir::instructions::InstructionData::RegFill { dst, .. } => { - if !used.is_avail(GPR, dst) { - used.free(GPR, dst); + if GPR.contains(dst) { + if !used.is_avail(GPR, dst) { + used.free(GPR, dst); + } + } else if FPR.contains(dst) { + if !used.is_avail(FPR, dst) { + used.free(FPR, dst); + } } } _ => (), @@ -509,7 +558,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C panic!("TODO: windows-fastcall: x86-32 not implemented yet"); } - let csrs = callee_saved_gprs_used(isa, func); + let csrs = callee_saved_regs_used(isa, func); // The reserved stack area is composed of: // return address + frame pointer + all callee-saved registers + shadow space @@ -519,11 +568,28 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // will adjust the stack pointer to make room for the rest of the required // space for this frame. 
let word_size = isa.pointer_bytes() as usize; + let num_fprs = csrs.iter(FPR).len(); let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; + // Only create an FPR stack slot if we're going to save FPRs. + let fpr_slot = if num_fprs > 0 { + // Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it + // seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be + // through `stack_load` and `stack_store` (see later comment about issue #1198). Even + // though in a certain light FPR preserve/restore is "spilling" an argument, regalloc + // implies that `SpillSlot` may be eligible for certain optimizations, and we know with + // certainty that this space may not be reused in the function, nor moved around. + Some(func.create_stack_slot(ir::StackSlotData { + kind: ir::StackSlotKind::ExplicitSlot, + size: (num_fprs * types::F64X2.bytes() as usize) as u32, + offset: None, + })) + } else { + None + }; + // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work // since cranelift does not support spill slots before incoming args - func.create_stack_slot(ir::StackSlotData { kind: ir::StackSlotKind::IncomingArg, size: csr_stack_size as u32, @@ -544,8 +610,22 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C func.signature.params.push(fp_arg); func.signature.returns.push(fp_arg); - for csr in csrs.iter(GPR) { - let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, csr); + for gp_csr in csrs.iter(GPR) { + let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, gp_csr); + func.signature.params.push(csr_arg); + func.signature.returns.push(csr_arg); + } + + for fp_csr in csrs.iter(FPR) { + // The calling convention described in + // https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention only requires + // preserving the low 128 bits of XMM6-XMM15. + // + // TODO: For now, add just an `F64` rather than `F64X2` because `F64X2` would require + // encoding a fstDisp8 with REX bits set, and we currently can't encode that. F64 causes a + // whole XMM register to be preserved anyway. 
+ let csr_arg = + ir::AbiParam::special_reg(types::F64, ir::ArgumentPurpose::CalleeSaved, fp_csr); func.signature.params.push(csr_arg); func.signature.returns.push(csr_arg); } @@ -553,8 +633,14 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // Set up the cursor and insert the prologue let entry_block = func.layout.entry_block().expect("missing entry block"); let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); - let prologue_cfa_state = - insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa); + let prologue_cfa_state = insert_common_prologue( + &mut pos, + local_stack_size, + reg_type, + &csrs, + fpr_slot.as_ref(), + isa, + ); // Reset the cursor and insert the epilogue let mut pos = pos.at_position(CursorPosition::Nowhere); @@ -563,6 +649,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C local_stack_size, reg_type, &csrs, + fpr_slot.as_ref(), isa, prologue_cfa_state, ); @@ -575,7 +662,11 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C let pointer_width = isa.triple().pointer_width().unwrap(); let word_size = pointer_width.bytes() as usize; - let csrs = callee_saved_gprs_used(isa, func); + let csrs = callee_saved_regs_used(isa, func); + assert!( + csrs.iter(FPR).len() == 0, + "SysV ABI does not have callee-save SIMD registers" + ); // The reserved stack area is composed of: // return address + frame pointer + all callee-saved registers @@ -615,7 +706,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C let entry_block = func.layout.entry_block().expect("missing entry block"); let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); let prologue_cfa_state = - insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa); + insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa); // Reset the cursor and insert the epilogue let mut pos = pos.at_position(CursorPosition::Nowhere); @@ -624,6 +715,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C local_stack_size, reg_type, &csrs, + None, isa, prologue_cfa_state, ); @@ -638,6 +730,7 @@ fn insert_common_prologue( stack_size: i64, reg_type: ir::types::Type, csrs: &RegisterSet, + fpr_slot: Option<&StackSlot>, isa: &dyn TargetIsa, ) -> Option<CFAState> { let word_size = isa.pointer_bytes() as isize; @@ -648,8 +741,11 @@ fn insert_common_prologue( // pushed CSRs, frame pointer. // Also, the size of a return address, implicitly pushed by an x86 `call` instruction, // also should be accounted for. + // If any FPRs are present, count them as well as necessary alignment space. // TODO: Check if the function body actually contains a `call` instruction. - let total_stack_size = (csrs.iter(GPR).len() + 1 + 1) as i64 * word_size as i64; + let mut total_stack_size = (csrs.iter(GPR).len() + 1 + 1) as i64 * word_size as i64; + + total_stack_size += csrs.iter(FPR).len() as i64 * types::F64X2.bytes() as i64; insert_stack_check(pos, total_stack_size, stack_limit_arg); } @@ -796,6 +892,55 @@ fn insert_common_prologue( } } + // Now that RSP is prepared for the function, we can use stack slots: + if let Some(fpr_slot) = fpr_slot { + debug_assert!(csrs.iter(FPR).len() != 0); + + // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base + // address.
We are well after postopt could run, so load the CSR region base once here, + // instead of hoping that the addr/store will be combined later. + // See also: https://github.com/bytecodealliance/wasmtime/pull/1198 + let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0); + + // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at + // function entry. + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16); + + let mut fpr_offset = 0; + + for reg in csrs.iter(FPR) { + // Append param to entry Block + let csr_arg = pos.func.dfg.append_block_param(block, types::F64); + + // Since regalloc has already run, we must assign a location. + pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); + + let reg_store_inst = + pos.ins() + .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset); + + // If we preserve FPRs, they occur after SP is adjusted, so also fix up the end point + // to this new instruction. + pos.func.prologue_end = Some(reg_store_inst); + fpr_offset += types::F64X2.bytes() as i32; + + if let Some(ref mut frame_layout) = pos.func.frame_layout { + let mut cfa_state = cfa_state + .as_mut() + .expect("cfa state exists when recording frame layout"); + cfa_state.current_depth -= types::F64X2.bytes() as isize; + frame_layout.instructions.insert( + reg_store_inst, + vec![FrameLayoutChange::RegAt { + reg, + cfa_offset: cfa_state.current_depth, + }] + .into_boxed_slice(), + ); + } + } + } + cfa_state } @@ -828,6 +973,7 @@ fn insert_common_epilogues( stack_size: i64, reg_type: ir::types::Type, csrs: &RegisterSet, + fpr_slot: Option<&StackSlot>, isa: &dyn TargetIsa, cfa_state: Option<CFAState>, ) { @@ -842,6 +988,7 @@ fn insert_common_epilogues( pos, reg_type, csrs, + fpr_slot, isa, is_last, cfa_state.clone(), @@ -859,11 +1006,57 @@ fn insert_common_epilogue( pos: &mut EncCursor, reg_type: ir::types::Type, csrs: &RegisterSet, + fpr_slot: Option<&StackSlot>, isa: &dyn TargetIsa, is_last: bool, mut cfa_state: Option<CFAState>, ) { let word_size = isa.pointer_bytes() as isize; + + // Even though instructions to restore FPRs are inserted first, we have to append them after + // restored GPRs to satisfy parameter order in the return. + let mut restored_fpr_values = Vec::new(); + + // Restore FPRs before we move RSP and invalidate stack slots. + if let Some(fpr_slot) = fpr_slot { + debug_assert!(csrs.iter(FPR).len() != 0); + + // `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base + // address. We are well after postopt could run, so load the CSR region base once here, + // instead of hoping that the addr/store will be combined later. + // + // See also: https://github.com/bytecodealliance/wasmtime/pull/1198 + let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0); + + // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at + // function exit. + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16); + + let mut fpr_offset = 0; + + for reg in csrs.iter(FPR) { + let value = pos + .ins() + .load(types::F64, ir::MemFlags::trusted(), stack_addr, fpr_offset); + fpr_offset += types::F64X2.bytes() as i32; + + if let Some(ref mut cfa_state) = cfa_state.as_mut() { + // Note: don't bother recording a frame layout change because the popped value is + // still correct in memory, and won't be overwritten until we've returned where the + // current frame's layout would no longer matter. Only adjust `current_depth` for a + // consistency check later.
+ cfa_state.current_depth += types::F64X2.bytes() as isize; + } + // Unlike GPRs before, we don't need to step back after each restoration because FPR + // restoration is order-insensitive. Furthermore: we want GPR restoration to begin + // after FPR restoration, so that stack adjustments occur after we're done relying on + // StackSlot validity. + + pos.func.locations[value] = ir::ValueLoc::Reg(reg); + restored_fpr_values.push(value); + } + } + if stack_size > 0 { pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)); } @@ -903,6 +1096,10 @@ fn insert_common_epilogue( pos.func.dfg.append_inst_arg(inst, csr_ret); } + for value in restored_fpr_values.into_iter() { + pos.func.dfg.append_inst_arg(inst, value); + } + if let Some(ref mut frame_layout) = pos.func.frame_layout { let cfa_state = cfa_state .as_mut() @@ -953,19 +1150,21 @@ pub fn emit_unwind_info( isa: &dyn TargetIsa, kind: FrameUnwindKind, sink: &mut dyn FrameUnwindSink, -) { +) -> CodegenResult<()> { match kind { FrameUnwindKind::Fastcall => { // Assumption: RBP is being used as the frame pointer // In the future, Windows fastcall codegen should usually omit the frame pointer - if let Some(info) = UnwindInfo::try_from_func(func, isa, Some(RU::rbp.into())) { + if let Some(info) = UnwindInfo::try_from_func(func, isa, Some(RU::rbp.into()))? { info.emit(sink); } } FrameUnwindKind::Libunwind => { if func.frame_layout.is_some() { - emit_fde(func, isa, sink); + emit_fde(func, isa, sink)?; } } } + + Ok(()) } diff --git a/cranelift/codegen/src/isa/x86/fde.rs b/cranelift/codegen/src/isa/x86/fde.rs index 9d6e38de31af..85ed5b5f2a75 100644 --- a/cranelift/codegen/src/isa/x86/fde.rs +++ b/cranelift/codegen/src/isa/x86/fde.rs @@ -4,6 +4,7 @@ use crate::binemit::{FrameUnwindOffset, FrameUnwindSink, Reloc}; use crate::ir::{FrameLayoutChange, Function}; use crate::isa::fde::RegisterMappingError; use crate::isa::{CallConv, RegUnit, TargetIsa}; +use crate::result::CodegenResult; use alloc::vec::Vec; use core::convert::TryInto; use gimli::write::{ @@ -178,7 +179,11 @@ fn to_cfi( } /// Creates FDE structure from FrameLayout. -pub fn emit_fde(func: &Function, isa: &dyn TargetIsa, sink: &mut dyn FrameUnwindSink) { +pub fn emit_fde( + func: &Function, + isa: &dyn TargetIsa, + sink: &mut dyn FrameUnwindSink, +) -> CodegenResult<()> { assert!(isa.name() == "x86"); // Expecting function with System V prologue @@ -266,6 +271,8 @@ pub fn emit_fde(func: &Function, isa: &dyn TargetIsa, sink: &mut dyn FrameUnwind // Need 0 marker for GCC unwind to end FDE "list".
sink.bytes(&[0, 0, 0, 0]); + + Ok(()) } #[cfg(test)] @@ -314,7 +321,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let mut sink = SimpleUnwindSink(Vec::new(), 0, Vec::new()); - emit_fde(&context.func, &*isa, &mut sink); + emit_fde(&context.func, &*isa, &mut sink).expect("can emit fde"); assert_eq!( sink.0, @@ -376,7 +383,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let mut sink = SimpleUnwindSink(Vec::new(), 0, Vec::new()); - emit_fde(&context.func, &*isa, &mut sink); + emit_fde(&context.func, &*isa, &mut sink).expect("can emit fde"); assert_eq!( sink.0, diff --git a/cranelift/codegen/src/isa/x86/mod.rs b/cranelift/codegen/src/isa/x86/mod.rs index 042874ea69e5..881f12bdb1b4 100644 --- a/cranelift/codegen/src/isa/x86/mod.rs +++ b/cranelift/codegen/src/isa/x86/mod.rs @@ -177,8 +177,8 @@ impl TargetIsa for Isa { func: &ir::Function, kind: FrameUnwindKind, sink: &mut dyn FrameUnwindSink, - ) { - abi::emit_unwind_info(func, self, kind, sink); + ) -> CodegenResult<()> { + abi::emit_unwind_info(func, self, kind, sink) } } diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index de653b89a68f..707b9e6d2c2b 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -1,11 +1,13 @@ //! Unwind information for x64 Windows. -use super::registers::{GPR, RU}; +use super::registers::{FPR, GPR, RU}; use crate::binemit::FrameUnwindSink; -use crate::ir::{Function, InstructionData, Opcode}; +use crate::ir::{Function, InstructionData, Opcode, ValueLoc}; use crate::isa::{CallConv, RegUnit, TargetIsa}; +use crate::result::{CodegenError, CodegenResult}; use alloc::vec::Vec; use byteorder::{ByteOrder, LittleEndian}; +use log::warn; /// Maximum (inclusive) size of a "small" stack allocation const SMALL_ALLOC_MAX_SIZE: u32 = 128; @@ -35,18 +37,34 @@ fn write_u32(sink: &mut dyn FrameUnwindSink, v: u32) { /// Note: the Cranelift x86 ISA RU enum matches the Windows unwind GPR encoding values. 
#[derive(Debug, PartialEq, Eq)] enum UnwindCode { - PushRegister { offset: u8, reg: RegUnit }, - StackAlloc { offset: u8, size: u32 }, - SetFramePointer { offset: u8, sp_offset: u8 }, + PushRegister { + offset: u8, + reg: RegUnit, + }, + SaveXmm { + offset: u8, + reg: RegUnit, + stack_offset: u32, + }, + StackAlloc { + offset: u8, + size: u32, + }, + SetFramePointer { + offset: u8, + sp_offset: u8, + }, } impl UnwindCode { fn emit(&self, sink: &mut dyn FrameUnwindSink) { enum UnwindOperation { - PushNonvolatileRegister, - LargeStackAlloc, - SmallStackAlloc, - SetFramePointer, + PushNonvolatileRegister = 0, + LargeStackAlloc = 1, + SmallStackAlloc = 2, + SetFramePointer = 3, + SaveXmm128 = 8, + SaveXmm128Far = 9, } match self { @@ -58,6 +76,28 @@ impl UnwindCode { | (UnwindOperation::PushNonvolatileRegister as u8), ); } + Self::SaveXmm { + offset, + reg, + stack_offset, + } => { + write_u8(sink, *offset); + let stack_offset = stack_offset / 16; + if stack_offset <= core::u16::MAX as u32 { + write_u8( + sink, + (FPR.index_of(*reg) << 4) as u8 | (UnwindOperation::SaveXmm128 as u8), + ); + write_u16::<LittleEndian>(sink, stack_offset as u16); + } else { + write_u8( + sink, + (FPR.index_of(*reg) << 4) as u8 | (UnwindOperation::SaveXmm128Far as u8), + ); + write_u16::<LittleEndian>(sink, stack_offset as u16); + write_u16::<LittleEndian>(sink, (stack_offset >> 16) as u16); + } + } Self::StackAlloc { offset, size } => { // Stack allocations on Windows must be a multiple of 8 and be at least 1 slot assert!(*size >= 8); @@ -98,6 +138,13 @@ impl UnwindCode { 3 } } + Self::SaveXmm { stack_offset, .. } => { + if *stack_offset <= core::u16::MAX as u32 { + 2 + } else { + 3 + } + } _ => 1, } } @@ -121,10 +168,10 @@ impl UnwindInfo { func: &Function, isa: &dyn TargetIsa, frame_register: Option<RegUnit>, - ) -> Option<Self> { + ) -> CodegenResult<Option<Self>> { // Only Windows fastcall is supported for unwind information if func.signature.call_conv != CallConv::WindowsFastcall || func.prologue_end.is_none() { - return None; + return Ok(None); } let prologue_end = func.prologue_end.unwrap(); @@ -136,10 +183,27 @@ impl UnwindInfo { let mut unwind_codes = Vec::new(); let mut found_end = false; + // Have we saved at least one FPR? If so, we might have to check additional constraints. + let mut saved_fpr = false; + + // In addition to the min offset for a callee-save, we need to know the offset from the + // frame base to the stack pointer, so that we can record an unwind offset that spans only + // to the end of callee-save space. + let mut static_frame_allocation_size = 0u32; + + // For the time being, FPR preservation is split into a stack_addr and later store/load. + // Store the register used for stack store and ensure it is the same register with no + // intervening changes to the frame size. + let mut callee_save_region_reg = None; + // Also record the callee-save region's offset from RSP, because it must be added to FPR + // save offsets to compute an offset from the frame base.
+ let mut callee_save_offset = None; + for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) { // x64 ABI prologues cannot exceed 255 bytes in length if (offset + size) > 255 { - panic!("function prologues cannot exceed 255 bytes in size for Windows x64"); + warn!("function prologues cannot exceed 255 bytes in size for Windows x64"); + return Err(CodegenError::CodeTooLarge); } prologue_size += size; @@ -150,18 +214,23 @@ impl UnwindInfo { InstructionData::Unary { opcode, arg } => { match opcode { Opcode::X86Push => { + static_frame_allocation_size += 8; + unwind_codes.push(UnwindCode::PushRegister { offset: unwind_offset, reg: func.locations[arg].unwrap_reg(), }); } Opcode::AdjustSpDown => { + let stack_size = + stack_size.expect("expected a previous stack size instruction"); + static_frame_allocation_size += stack_size; + // This is used when calling a stack check function // We need to track the assignment to RAX which has the size of the stack unwind_codes.push(UnwindCode::StackAlloc { offset: unwind_offset, - size: stack_size - .expect("expected a previous stack size instruction"), + size: stack_size, }); } _ => {} @@ -170,6 +239,10 @@ impl UnwindInfo { InstructionData::CopySpecial { src, dst, .. } => { if let Some(frame_register) = frame_register { if src == (RU::rsp as RegUnit) && dst == frame_register { + // Constructing an rbp-based stack frame, so the static frame + // allocation restarts at 0 from here. + static_frame_allocation_size = 0; + unwind_codes.push(UnwindCode::SetFramePointer { offset: unwind_offset, sp_offset: 0, @@ -194,6 +267,8 @@ impl UnwindInfo { let imm: i64 = imm.into(); assert!(imm <= core::u32::MAX as i64); + static_frame_allocation_size += imm as u32; + unwind_codes.push(UnwindCode::StackAlloc { offset: unwind_offset, size: imm as u32, @@ -202,6 +277,55 @@ impl UnwindInfo { _ => {} } } + InstructionData::StackLoad { + opcode: Opcode::StackAddr, + stack_slot, + offset: _, + } => { + let result = func.dfg.inst_results(inst).get(0).unwrap(); + if let ValueLoc::Reg(frame_reg) = func.locations[*result] { + callee_save_region_reg = Some(frame_reg); + + // Figure out the offset in the call frame that `frame_reg` will have. + let frame_size = func + .stack_slots + .layout_info + .expect("func's stack slots have layout info if stack operations exist") + .frame_size; + // Because we're well after the prologue has been constructed, stack slots + // must have been laid out... 
+ let slot_offset = func.stack_slots[stack_slot] + .offset + .expect("callee-save slot has an offset computed"); + let frame_offset = frame_size as i32 + slot_offset; + + callee_save_offset = Some(frame_offset as u32); + } + } + InstructionData::Store { + opcode: Opcode::Store, + args: [arg1, arg2], + flags: _flags, + offset, + } => { + if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) = + (func.locations[arg1], func.locations[arg2]) + { + if Some(base_ru) == callee_save_region_reg { + let offset_int: i32 = offset.into(); + assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error"); + let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR preservation requires an FPR save region, which has some stack offset"); + if FPR.contains(ru) { + saved_fpr = true; + unwind_codes.push(UnwindCode::SaveXmm { + offset: unwind_offset, + reg: ru, + stack_offset: offset_int, + }); + } + } + } + } _ => {} }; @@ -212,16 +336,46 @@ impl UnwindInfo { } if !found_end { - return None; + return Ok(None); } - Some(Self { + if saved_fpr { + if static_frame_allocation_size > 240 && saved_fpr { + warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \ + This is a Cranelift implementation limit, see \ + https://github.com/bytecodealliance/wasmtime/issues/1475", + static_frame_allocation_size); + return Err(CodegenError::ImplLimitExceeded); + } + // Only test static frame size is 16-byte aligned when an FPR is saved to avoid + // panicking when alignment is elided because no FPRs are saved and no child calls are + // made. + assert!( + static_frame_allocation_size % 16 == 0, + "static frame allocation must be a multiple of 16" + ); + } + + // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at + // one end of the call frame, and RSP at the other, required offsets are arbitrarily large. + // Windows x64 SEH only allows this offset to be up to 240 bytes, however, meaning large + // frames are inexpressible, and we cannot actually compile the function. In case there are + // no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing + // will actually check it. This, then, avoids panics when compiling functions with large + // call frames.
+ let reported_frame_offset = if saved_fpr { + (static_frame_allocation_size / 16) as u8 + } else { + 0 + }; + + Ok(Some(Self { flags: 0, // this assumes cranelift functions have no SEH handlers prologue_size: prologue_size as u8, frame_register, - frame_register_offset: 0, + frame_register_offset: reported_frame_offset, unwind_codes, - }) + })) } pub fn size(&self) -> usize { @@ -320,7 +474,10 @@ mod tests { context.compile(&*isa).expect("expected compilation"); - assert_eq!(UnwindInfo::try_from_func(&context.func, &*isa, None), None); + assert_eq!( + UnwindInfo::try_from_func(&context.func, &*isa, None).expect("can emit unwind info"), + None + ); } #[test] @@ -337,6 +494,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let unwind = UnwindInfo::try_from_func(&context.func, &*isa, Some(RU::rbp.into())) + .expect("can emit unwind info") .expect("expected unwind info"); assert_eq!( @@ -401,6 +559,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let unwind = UnwindInfo::try_from_func(&context.func, &*isa, Some(RU::rbp.into())) + .expect("can emit unwind info") .expect("expected unwind info"); assert_eq!( @@ -465,6 +624,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let unwind = UnwindInfo::try_from_func(&context.func, &*isa, Some(RU::rbp.into())) + .expect("can emit unwind info") .expect("expected unwind info"); assert_eq!( diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 55a6c59bed2e..917d179ccf74 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -32,6 +32,31 @@ block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64): } ; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall { +; check that we preserve xmm6 and above if we're using them locally +function %float_callee_saves(f64, f64, f64, f64) windows_fastcall { +block0(v0: f64, v1: f64, v2: f64, v3: f64): +; explicitly use a callee-save register +[-, %xmm6] v4 = fadd v0, v1 +[-, %xmm7] v5 = fadd v0, v1 + return +} +; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7]) -> i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7] windows_fastcall { +; nextln: ss0 = explicit_slot 32, offset -80 +; nextln: ss1 = incoming_arg 16, offset -48 +; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64 [%xmm6], v9: f64 [%xmm7]): +; nextln: x86_push v6 +; nextln: copy_special %rsp -> %rbp +; nextln: adjust_sp_down_imm 64 +; nextln: v7 = stack_addr.i64 ss0 +; nextln: store notrap aligned v8, v7 +; nextln: store notrap aligned v9, v7+16 +; check: v10 = stack_addr.i64 ss0 +; nextln: v11 = load.f64 notrap aligned v10 +; nextln: v12 = load.f64 notrap aligned v10+16 +; nextln: adjust_sp_up_imm 64 +; nextln: v13 = x86_pop.i64 +; nextln: return v13, v11, v12 + function %mixed_int_float(i64, f64, i64, f32) windows_fastcall { block0(v0: i64, v1: f64, v2: i64, v3: f32): return @@ -43,3 +68,29 @@ block0(v0: f32, v1: f64, v2: i64, v3: i64): return v1 } ; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall { + +function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall { + fn0 = %foo(i64, i64, i64, i64) -> i64
fn1 = %foo2(i64, i64, i64, i64) -> i64 +block0(v0: i64): + v1 = load.i64 v0+0 + v2 = load.i64 v0+8 + v3 = load.i64 v0+16 + v4 = load.i64 v0+24 + v5 = load.i64 v0+32 + v6 = load.i64 v0+40 + v7 = load.i64 v0+48 + v8 = load.i64 v0+56 + v9 = load.i64 v0+64 + v10 = call fn0(v1, v2, v3, v4) + store.i64 v1, v0+8 + store.i64 v2, v0+16 + store.i64 v3, v0+24 + store.i64 v4, v0+32 + store.i64 v5, v0+40 + store.i64 v6, v0+48 + store.i64 v7, v0+56 + store.i64 v8, v0+64 + store.i64 v9, v0+72 + return v10 +} diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif index b146f0ac7696..1119c550128f 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif @@ -118,6 +118,53 @@ block0: ; nextln: ], ; nextln: } +function %fpr_with_function_call(i64, i64) windows_fastcall { + fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall; +block0(v0: i64, v1: i64): + v2 = load.f64 v0+0 + v3 = load.f64 v0+8 + v4 = load.i64 v0+16 + v15 = load.f64 v0+104 + v16 = load.f64 v0+112 + v17 = load.f64 v0+120 + v18 = load.f64 v0+128 + v19 = load.f64 v0+136 + v20 = load.f64 v0+144 + v21 = load.f64 v0+152 + v22 = load.f64 v0+160 + v23 = load.f64 v0+168 + call fn0(v2, v3, v4, v1, v1) + store.f64 v15, v1+104 + store.f64 v16, v1+112 + store.f64 v17, v1+120 + store.f64 v18, v1+128 + store.f64 v19, v1+136 + store.f64 v20, v1+144 + store.f64 v21, v1+152 + store.f64 v22, v1+160 + store.f64 v23, v1+168 + return +} +; Only check the first unwind code here because this test specifically looks to +; see that in a function that is not a leaf, a callee-save FPR is stored in an +; area that does not overlap either the callee's shadow space or stack argument +; space. 
+; +; sameln: UnwindInfo { +; nextln: version: 1, +; nextln: flags: 0, +; nextln: prologue_size: 26, +; nextln: unwind_code_count_raw: 7, +; nextln: frame_register: 5, +; nextln: frame_register_offset: 12, +; nextln: unwind_codes: [ +; nextln: UnwindCode { +; nextln: offset: 26, +; nextln: op: SaveXmm128, +; nextln: info: 15, +; nextln: value: U16( +; nextln: 3, + ; check a function that has CSRs function %lots_of_registers(i64, i64) windows_fastcall { block0(v0: i64, v1: i64): @@ -134,6 +181,15 @@ block0(v0: i64, v1: i64): v12 = load.i32 v0+80 v13 = load.i32 v0+88 v14 = load.i32 v0+96 + v15 = load.f64 v0+104 + v16 = load.f64 v0+112 + v17 = load.f64 v0+120 + v18 = load.f64 v0+128 + v19 = load.f64 v0+136 + v20 = load.f64 v0+144 + v21 = load.f64 v0+152 + v22 = load.f64 v0+160 + v23 = load.f64 v0+168 store.i32 v2, v1+0 store.i32 v3, v1+8 store.i32 v4, v1+16 @@ -147,20 +203,53 @@ block0(v0: i64, v1: i64): store.i32 v12, v1+80 store.i32 v13, v1+88 store.i32 v14, v1+96 + store.f64 v15, v1+104 + store.f64 v16, v1+112 + store.f64 v17, v1+120 + store.f64 v18, v1+128 + store.f64 v19, v1+136 + store.f64 v20, v1+144 + store.f64 v21, v1+152 + store.f64 v22, v1+160 + store.f64 v23, v1+168 return } ; sameln: UnwindInfo { ; nextln: version: 1, ; nextln: flags: 0, -; nextln: prologue_size: 19, -; nextln: unwind_code_count_raw: 10, +; nextln: prologue_size: 44, +; nextln: unwind_code_count_raw: 16, ; nextln: frame_register: 5, -; nextln: frame_register_offset: 0, +; nextln: frame_register_offset: 10, ; nextln: unwind_codes: [ ; nextln: UnwindCode { +; nextln: offset: 44, +; nextln: op: SaveXmm128, +; nextln: info: 8, +; nextln: value: U16( +; nextln: 2, +; nextln: ), +; nextln: }, +; nextln: UnwindCode { +; nextln: offset: 38, +; nextln: op: SaveXmm128, +; nextln: info: 7, +; nextln: value: U16( +; nextln: 1, +; nextln: ), +; nextln: }, +; nextln: UnwindCode { +; nextln: offset: 32, +; nextln: op: SaveXmm128, +; nextln: info: 6, +; nextln: value: U16( +; nextln: 0, +; nextln: ), +; nextln: }, +; nextln: UnwindCode { ; nextln: offset: 19, ; nextln: op: SmallStackAlloc, -; nextln: info: 3, +; nextln: info: 12, ; nextln: value: None, ; nextln: }, ; nextln: UnwindCode { diff --git a/cranelift/filetests/src/test_fde.rs b/cranelift/filetests/src/test_fde.rs index 3e3747fdde2c..5a9305479bba 100644 --- a/cranelift/filetests/src/test_fde.rs +++ b/cranelift/filetests/src/test_fde.rs @@ -64,7 +64,9 @@ impl SubTest for TestUnwind { } let mut sink = SimpleUnwindSink(Vec::new(), 0, Vec::new()); - comp_ctx.emit_unwind_info(isa, FrameUnwindKind::Libunwind, &mut sink); + comp_ctx + .emit_unwind_info(isa, FrameUnwindKind::Libunwind, &mut sink) + .expect("can emit unwind info"); let mut text = String::new(); if sink.0.is_empty() { diff --git a/cranelift/filetests/src/test_unwind.rs b/cranelift/filetests/src/test_unwind.rs index 3db1cbf8299e..bf51cb73db0f 100644 --- a/cranelift/filetests/src/test_unwind.rs +++ b/cranelift/filetests/src/test_unwind.rs @@ -59,7 +59,9 @@ impl SubTest for TestUnwind { } let mut sink = Sink(Vec::new()); - comp_ctx.emit_unwind_info(isa, FrameUnwindKind::Fastcall, &mut sink); + comp_ctx + .emit_unwind_info(isa, FrameUnwindKind::Fastcall, &mut sink) + .expect("can emit unwind info"); let mut text = String::new(); if sink.0.is_empty() { @@ -177,15 +179,15 @@ impl UnwindCode { #[derive(Debug)] enum UnwindOperation { - PushNonvolatileRegister, - LargeStackAlloc, - SmallStackAlloc, - SetFramePointer, - SaveNonvolatileRegister, - SaveNonvolatileRegisterFar, - SaveXmm128, - SaveXmm128Far, - 
PushMachineFrame, + PushNonvolatileRegister = 0, + LargeStackAlloc = 1, + SmallStackAlloc = 2, + SetFramePointer = 3, + SaveNonvolatileRegister = 4, + SaveNonvolatileRegisterFar = 5, + SaveXmm128 = 8, + SaveXmm128Far = 9, + PushMachineFrame = 10, } impl From<u8> for UnwindOperation { @@ -198,9 +200,9 @@ match n { 3 => Self::SetFramePointer, 4 => Self::SaveNonvolatileRegister, 5 => Self::SaveNonvolatileRegisterFar, - 6 => Self::SaveXmm128, - 7 => Self::SaveXmm128Far, - 8 => Self::PushMachineFrame, + 8 => Self::SaveXmm128, + 9 => Self::SaveXmm128Far, + 10 => Self::PushMachineFrame, _ => panic!("unsupported unwind operation"), } } diff --git a/crates/api/src/trampoline/func.rs b/crates/api/src/trampoline/func.rs index 4d64482deacc..fbfa0d3e811a 100644 --- a/crates/api/src/trampoline/func.rs +++ b/crates/api/src/trampoline/func.rs @@ -188,7 +188,9 @@ fn make_trampoline( .map_err(|error| pretty_error(&context.func, Some(isa), error)) .expect("compile_and_emit"); - let unwind_info = CompiledFunctionUnwindInfo::new(isa, &context); + let unwind_info = CompiledFunctionUnwindInfo::new(isa, &context) + .map_err(|error| pretty_error(&context.func, Some(isa), error)) + .expect("emit unwind info"); code_memory .allocate_for_function(&CompiledFunction { diff --git a/crates/environ/src/compilation.rs b/crates/environ/src/compilation.rs index 8c7bde36215d..82f8786d1792 100644 --- a/crates/environ/src/compilation.rs +++ b/crates/environ/src/compilation.rs @@ -4,7 +4,7 @@ use crate::cache::ModuleCacheDataTupleType; use crate::CacheConfig; use crate::ModuleTranslation; -use cranelift_codegen::{binemit, ir, isa, Context}; +use cranelift_codegen::{binemit, ir, isa, CodegenResult, Context}; use cranelift_entity::PrimaryMap; use cranelift_wasm::{DefinedFuncIndex, FuncIndex, WasmError}; use serde::{Deserialize, Serialize}; @@ -36,7 +36,7 @@ pub enum CompiledFunctionUnwindInfo { impl CompiledFunctionUnwindInfo { /// Constructs unwind info object. - pub fn new(isa: &dyn isa::TargetIsa, context: &Context) -> Self { + pub fn new(isa: &dyn isa::TargetIsa, context: &Context) -> CodegenResult<Self> { use cranelift_codegen::binemit::{ FrameUnwindKind, FrameUnwindOffset, FrameUnwindSink, Reloc, }; @@ -75,24 +75,26 @@ impl CompiledFunctionUnwindInfo { CallConv::SystemV | CallConv::Fast | CallConv::Cold => FrameUnwindKind::Libunwind, CallConv::WindowsFastcall => FrameUnwindKind::Fastcall, _ => { - return CompiledFunctionUnwindInfo::None; + return Ok(CompiledFunctionUnwindInfo::None); } }; let mut sink = Sink(Vec::new(), 0, Vec::new()); - context.emit_unwind_info(isa, kind, &mut sink); + context.emit_unwind_info(isa, kind, &mut sink)?; let Sink(data, offset, relocs) = sink; if data.is_empty() { - return CompiledFunctionUnwindInfo::None; + return Ok(CompiledFunctionUnwindInfo::None); } - match kind { + let info = match kind { FrameUnwindKind::Fastcall => CompiledFunctionUnwindInfo::Windows(data), FrameUnwindKind::Libunwind => { CompiledFunctionUnwindInfo::FrameLayout(data, offset, relocs) } - } + }; + + Ok(info) } /// Returns true if there is no unwind info data. diff --git a/crates/environ/src/cranelift.rs b/crates/environ/src/cranelift.rs index 061457b1d50f..7f979c12a5a7 100644 --- a/crates/environ/src/cranelift.rs +++ b/crates/environ/src/cranelift.rs @@ -264,7 +264,9 @@ fn compile(env: CompileEnv<'_>) -> Result
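
Editor's note: the sketch below is not part of the patch. It is a minimal, standalone illustration of the Windows x64 UNWIND_CODE layout that the new `UnwindCode::SaveXmm` arm above emits: UWOP_SAVE_XMM128 (op 8) carries a 16-byte-scaled offset in one u16 node, while UWOP_SAVE_XMM128_FAR (op 9) carries an unscaled 32-bit offset in two u16 nodes, per Microsoft's x64 exception-handling documentation. The function name `encode_save_xmm` and the sample values are hypothetical.

// Hypothetical, self-contained sketch of the UNWIND_CODE encoding; not patch code.
fn encode_save_xmm(prologue_offset: u8, xmm_index: u8, stack_offset: u32) -> Vec<u8> {
    const UWOP_SAVE_XMM128: u8 = 8; // matches UnwindOperation::SaveXmm128 above
    const UWOP_SAVE_XMM128_FAR: u8 = 9; // matches UnwindOperation::SaveXmm128Far above

    let mut out = Vec::new();
    // Node 1: offset of the end of the save instruction within the prologue.
    out.push(prologue_offset);
    // XMM128 saves are 16-byte aligned, so the near form scales the offset down by 16.
    let scaled = stack_offset / 16;
    if scaled <= u16::MAX as u32 {
        // Node 2: op info (XMM register number) in the high nibble, op code in the low nibble.
        out.push((xmm_index << 4) | UWOP_SAVE_XMM128);
        // Node 3: the scaled offset as one little-endian u16.
        out.extend_from_slice(&(scaled as u16).to_le_bytes());
    } else {
        // Far form: per Microsoft's documentation this carries the unscaled 32-bit
        // offset in two u16 nodes, low half first.
        out.push((xmm_index << 4) | UWOP_SAVE_XMM128_FAR);
        out.extend_from_slice(&(stack_offset as u16).to_le_bytes());
        out.extend_from_slice(&((stack_offset >> 16) as u16).to_le_bytes());
    }
    out
}

fn main() {
    // Example: xmm6 saved at [rsp+32], with the store ending 10 bytes into the prologue:
    // node 1 = 10, node 2 = 0x68 (reg 6 << 4 | op 8), node 3 = 2 (32 / 16) as LE u16.
    assert_eq!(encode_save_xmm(10, 6, 32), vec![10, 0x68, 2, 0]);
}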