From 4543e234fd4f62d890d4181a3f2e3a44c78aaa99 Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 5 Feb 2020 22:07:00 -0800 Subject: [PATCH 01/41] initial brush at preserving SIMD registers in Windows fastcall functions --- cranelift/codegen/src/isa/x86/abi.rs | 151 ++++++++++++++++-- .../codegen/src/regalloc/register_set.rs | 5 + .../isa/x86/windows_fastcall_x64.clif | 10 ++ 3 files changed, 150 insertions(+), 16 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index db67457a6c76..bc2e51949e8a 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -12,8 +12,9 @@ use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion}; use crate::binemit::{FrameUnwindKind, FrameUnwindSink}; use crate::cursor::{Cursor, CursorPosition, EncCursor}; use crate::ir; -use crate::ir::immediates::Imm64; +use crate::ir::immediates::{Imm64, Offset32}; use crate::ir::stackslot::{StackOffset, StackSize}; +use crate::ir::types; use crate::ir::{ get_probestack_funcref, AbiParam, ArgumentExtension, ArgumentLoc, ArgumentPurpose, FrameLayoutChange, InstBuilder, ValueLoc, @@ -366,17 +367,18 @@ pub fn allocatable_registers(triple: &Triple, flags: &shared_settings::Flags) -> regs } -/// Get the set of callee-saved registers. +/// Get the set of callee-saved registers general-purpose registers. fn callee_saved_gprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] { match isa.triple().pointer_width().unwrap() { PointerWidth::U16 => panic!(), PointerWidth::U32 => &[RU::rbx, RU::rsi, RU::rdi], PointerWidth::U64 => { if call_conv.extends_windows_fastcall() { - // "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15 are considered nonvolatile - // and must be saved and restored by a function that uses them." + // "registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15 are + // considered nonvolatile and must be saved and restored by a function that uses + // them." // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention - // RSP & RSB are not listed below, since they are restored automatically during + // RSP & RBP are not listed below, since they are restored automatically during // a function call. If that wasn't the case, function calls (RET) would not work. &[ RU::rbx, @@ -394,12 +396,45 @@ fn callee_saved_gprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] } } +/// Get the set of callee-saved floating-point (SIMD) registers. +fn callee_saved_fprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] { + match isa.triple().pointer_width().unwrap() { + PointerWidth::U16 => panic!(), + PointerWidth::U32 => &[], + PointerWidth::U64 => { + if call_conv.extends_windows_fastcall() { + // "registers RBX, ... , and XMM6-15 are considered nonvolatile and must be saved + // and restored by a function that uses them." + // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention as of + // 02-05-2020. + &[ + RU::xmm6, + RU::xmm7, + RU::xmm8, + RU::xmm9, + RU::xmm10, + RU::xmm11, + RU::xmm12, + RU::xmm13, + RU::xmm14, + RU::xmm15, + ] + } else { + &[] + } + } + } +} + /// Get the set of callee-saved registers that are used. -fn callee_saved_gprs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterSet { +fn callee_saved_regs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterSet { let mut all_callee_saved = RegisterSet::empty(); for reg in callee_saved_gprs(isa, func.signature.call_conv) { all_callee_saved.free(GPR, *reg as RegUnit); } + for reg in callee_saved_fprs(isa, func.signature.call_conv) { + all_callee_saved.free(FPR, *reg as RegUnit); + } let mut used = RegisterSet::empty(); for value_loc in func.locations.values() { @@ -407,8 +442,14 @@ fn callee_saved_gprs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterS // register. We don't use registers that overlap each other in the x86 ISA, but in others // we do. So this should not be blindly reused. if let ValueLoc::Reg(ru) = *value_loc { - if !used.is_avail(GPR, ru) { - used.free(GPR, ru); + if GPR.contains(ru) { + if !used.is_avail(GPR, ru) { + used.free(GPR, ru); + } + } else if FPR.contains(ru) { + if !used.is_avail(FPR, ru) { + used.free(FPR, ru); + } } } } @@ -424,8 +465,14 @@ fn callee_saved_gprs_used(isa: &dyn TargetIsa, func: &ir::Function) -> RegisterS match func.dfg[inst] { ir::instructions::InstructionData::RegMove { dst, .. } | ir::instructions::InstructionData::RegFill { dst, .. } => { - if !used.is_avail(GPR, dst) { - used.free(GPR, dst); + if GPR.contains(dst) { + if !used.is_avail(GPR, dst) { + used.free(GPR, dst); + } + } else if FPR.contains(dst) { + if !used.is_avail(FPR, dst) { + used.free(FPR, dst); + } } } _ => (), @@ -509,17 +556,21 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C panic!("TODO: windows-fastcall: x86-32 not implemented yet"); } - let csrs = callee_saved_gprs_used(isa, func); + let csrs = callee_saved_regs_used(isa, func); // The reserved stack area is composed of: // return address + frame pointer + all callee-saved registers + shadow space // + // XMM registers are twice the size of general purpose registers, so count them + // twice for stack space accounting. + // // Pushing the return address is an implicit function of the `call` // instruction. Each of the others we will then push explicitly. Then we // will adjust the stack pointer to make room for the rest of the required // space for this frame. let word_size = isa.pointer_bytes() as usize; - let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; + let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size + + csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as i32; // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work // since cranelift does not support spill slots before incoming args @@ -544,8 +595,20 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C func.signature.params.push(fp_arg); func.signature.returns.push(fp_arg); - for csr in csrs.iter(GPR) { - let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, csr); + for gp_csr in csrs.iter(GPR) { + let csr_arg = ir::AbiParam::special_reg(reg_type, ir::ArgumentPurpose::CalleeSaved, gp_csr); + func.signature.params.push(csr_arg); + func.signature.returns.push(csr_arg); + } + + for fp_csr in csrs.iter(FPR) { + // The calling convention described in + // https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention only specifically + // discusses XMM6-XMM15, which are 128-bit registers. It may be sufficient to preserve only + // the low 128 bits here, but we may find that in practice we should preserve all of + // YMM6-15 (or even ZMM6-15?) + let csr_arg = + ir::AbiParam::special_reg(types::F64X2, ir::ArgumentPurpose::CalleeSaved, fp_csr); func.signature.params.push(csr_arg); func.signature.returns.push(csr_arg); } @@ -575,7 +638,11 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C let pointer_width = isa.triple().pointer_width().unwrap(); let word_size = pointer_width.bytes() as usize; - let csrs = callee_saved_gprs_used(isa, func); + let csrs = callee_saved_regs_used(isa, func); + assert!( + csrs.iter(FPR).len() == 0, + "SysV ABI does not have callee-save SIMD registers" + ); // The reserved stack area is composed of: // return address + frame pointer + all callee-saved registers @@ -649,7 +716,8 @@ fn insert_common_prologue( // Also, the size of a return address, implicitly pushed by a x86 `call` instruction, // also should be accounted for. // TODO: Check if the function body actually contains a `call` instruction. - let total_stack_size = (csrs.iter(GPR).len() + 1 + 1) as i64 * word_size as i64; + let total_stack_size = (csrs.iter(GPR).len() + 1 + 1) as i64 * word_size as i64 + + csrs.iter(FPR).len() as i64 * types::F64X2.bytes() as i64; insert_stack_check(pos, total_stack_size, stack_limit_arg); } @@ -752,6 +820,36 @@ fn insert_common_prologue( } } + for reg in csrs.iter(FPR) { + // Append param to entry EBB + let csr_arg = pos.func.dfg.append_ebb_param(ebb, types::F64X2); + + // Assign it a location + pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); + + let reg_store_inst = pos.ins().store( + ir::MemFlags::trusted(), + csr_arg, + panic!("get RSP value"), + Offset32::new(panic!("value-appropriate offset")), + ); + + if let Some(ref mut frame_layout) = pos.func.frame_layout { + let mut cfa_state = cfa_state + .as_mut() + .expect("cfa state exists when recording frame layout"); + cfa_state.current_depth -= types::F64X2.bytes() as isize; + frame_layout.instructions.insert( + reg_store_inst, + vec![FrameLayoutChange::RegAt { + reg, + cfa_offset: cfa_state.current_depth, + }] + .into_boxed_slice(), + ); + } + } + // Allocate stack frame storage. if stack_size > 0 { if isa.flags().enable_probestack() && stack_size > (1 << isa.flags().probestack_size_log2()) @@ -903,6 +1001,27 @@ fn insert_common_epilogue( pos.func.dfg.append_inst_arg(inst, csr_ret); } + for reg in csrs.iter(FPR) { + let value = pos.ins().load( + types::F64X2, + ir::MemFlags::trusted(), + panic!("get RSP value"), + Offset32::new(panic!("value-appropriate offset")), + ); + + if let Some(ref mut cfa_state) = cfa_state.as_mut() { + // Note: don't bother recording a frame layout change because the popped value is + // still correct in memory, and won't be overwritten until we've returned where the + // current frame's layout would no longer matter. Only adjust `current_depth` for a + // consistency check later. + cfa_state.current_depth += types::F64X2.bytes() as isize; + } + pos.prev_inst(); + + pos.func.locations[value] = ir::ValueLoc::Reg(reg); + pos.func.dfg.append_inst_arg(inst, value); + } + if let Some(ref mut frame_layout) = pos.func.frame_layout { let cfa_state = cfa_state .as_mut() diff --git a/cranelift/codegen/src/regalloc/register_set.rs b/cranelift/codegen/src/regalloc/register_set.rs index 52b8a6fa0a10..ff363f505a75 100644 --- a/cranelift/codegen/src/regalloc/register_set.rs +++ b/cranelift/codegen/src/regalloc/register_set.rs @@ -46,6 +46,11 @@ impl RegisterSet { Self { avail: [0; 3] } } + /// Returns `true` if all registers in this set are available. + pub fn is_empty(&self) -> bool { + self.avail == [0; 3] + } + /// Returns `true` if the specified register is available. pub fn is_avail(&self, rc: RegClass, reg: RegUnit) -> bool { let (idx, bits) = bitmask(rc, reg); diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 55a6c59bed2e..54c66f89b7c3 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -32,6 +32,16 @@ block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64): } ; check: function %five_args(i64 [%rcx], i64 [%rdx], i64 [%r8], i64 [%r9], i64 [32], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall { +; check that we preserve xmm6 and above if we're using them locally +function %float_callee_saves(f64, f64, f64, f64) windows_fastcall { +ebb0(v0: f64, v1: f64, v2: f64, v3: f64): +[-, %xmm6] v4 = fadd v0, v1 ; bin: 66 0f db f3 +;[-, %xmm6] v5 = fadd v4, v2 ; bin: 66 0f db f3 +;[-, %xmm6] v6 = fadd v5, v3 ; bin: 66 0f db f3 + return +} +; check: function %float_callee_saves(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], f64 [%xmm6], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall { + function %mixed_int_float(i64, f64, i64, f32) windows_fastcall { block0(v0: i64, v1: f64, v2: i64, v3: f32): return From 218ff4f436cb0e24df071e9e9827397723dbace5 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 6 Feb 2020 11:14:19 -0800 Subject: [PATCH 02/41] remove outdated comment and align CSR space if any SIMD registers are preserved --- cranelift/codegen/src/isa/x86/abi.rs | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index bc2e51949e8a..76eced3be421 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -561,16 +561,20 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // The reserved stack area is composed of: // return address + frame pointer + all callee-saved registers + shadow space // - // XMM registers are twice the size of general purpose registers, so count them - // twice for stack space accounting. - // // Pushing the return address is an implicit function of the `call` // instruction. Each of the others we will then push explicitly. Then we // will adjust the stack pointer to make room for the rest of the required // space for this frame. let word_size = isa.pointer_bytes() as usize; - let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size - + csrs.iter(FPR).len() * types::F64X2.bytes() as usize) as i32; + let num_fprs = csrs.iter(FPR).len(); + let mut csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; + + if num_fprs != 0 { + csr_stack_size += (num_fprs * types::F64X2.bytes() as usize) as i32; + + // Ensure that csr stack space is 16-byte aligned for ideal SIMD value placement. + csr_stack_size = csr_stack_size & !0b1111 + 16; + } // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work // since cranelift does not support spill slots before incoming args @@ -715,9 +719,17 @@ fn insert_common_prologue( // pushed CSRs, frame pointer. // Also, the size of a return address, implicitly pushed by a x86 `call` instruction, // also should be accounted for. + // If any FPR are present, count them as well as necessary alignment space. // TODO: Check if the function body actually contains a `call` instruction. - let total_stack_size = (csrs.iter(GPR).len() + 1 + 1) as i64 * word_size as i64 - + csrs.iter(FPR).len() as i64 * types::F64X2.bytes() as i64; + let mut total_stack_size = (csrs.iter(GPR).len() + 1 + 1) as i64 * word_size as i64; + + let num_fpr = csrs.iter(FPR).len(); + if num_fpr > 0 { + total_stack_size += csrs.iter(FPR).len() as i64 * types::F64X2.bytes() as i64; + + // Ensure that csr stack space is 16-byte aligned for ideal SIMD value placement. + total_stack_size = total_stack_size & !0b1111 + 16; + } insert_stack_check(pos, total_stack_size, stack_limit_arg); } From dd558cb710b4fa7a326903c1964aa5fcde6104c8 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 6 Feb 2020 14:15:35 -0800 Subject: [PATCH 03/41] x86: preserve/restore callee-save xmm registers --- cranelift/codegen/src/isa/x86/abi.rs | 128 +++++++++++------- .../isa/x86/windows_fastcall_x64.clif | 19 ++- 2 files changed, 93 insertions(+), 54 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index 76eced3be421..ab0056dba4b0 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -12,7 +12,8 @@ use crate::abi::{legalize_args, ArgAction, ArgAssigner, ValueConversion}; use crate::binemit::{FrameUnwindKind, FrameUnwindSink}; use crate::cursor::{Cursor, CursorPosition, EncCursor}; use crate::ir; -use crate::ir::immediates::{Imm64, Offset32}; +use crate::ir::entities::StackSlot; +use crate::ir::immediates::Imm64; use crate::ir::stackslot::{StackOffset, StackSize}; use crate::ir::types; use crate::ir::{ @@ -367,7 +368,7 @@ pub fn allocatable_registers(triple: &Triple, flags: &shared_settings::Flags) -> regs } -/// Get the set of callee-saved registers general-purpose registers. +/// Get the set of callee-saved general-purpose registers. fn callee_saved_gprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] { match isa.triple().pointer_width().unwrap() { PointerWidth::U16 => panic!(), @@ -579,7 +580,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work // since cranelift does not support spill slots before incoming args - func.create_stack_slot(ir::StackSlotData { + let csr_slot = func.create_stack_slot(ir::StackSlotData { kind: ir::StackSlotKind::IncomingArg, size: csr_stack_size as u32, offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)), @@ -621,7 +622,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C let entry_block = func.layout.entry_block().expect("missing entry block"); let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); let prologue_cfa_state = - insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa); + insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, &csr_slot, isa); // Reset the cursor and insert the epilogue let mut pos = pos.at_position(CursorPosition::Nowhere); @@ -630,6 +631,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C local_stack_size, reg_type, &csrs, + &csr_slot, isa, prologue_cfa_state, ); @@ -656,7 +658,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // will adjust the stack pointer to make room for the rest of the required // space for this frame. let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; - func.create_stack_slot(ir::StackSlotData { + let csr_slot = func.create_stack_slot(ir::StackSlotData { kind: ir::StackSlotKind::IncomingArg, size: csr_stack_size as u32, offset: Some(-csr_stack_size), @@ -686,7 +688,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C let entry_block = func.layout.entry_block().expect("missing entry block"); let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); let prologue_cfa_state = - insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, isa); + insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, &csr_slot, isa); // Reset the cursor and insert the epilogue let mut pos = pos.at_position(CursorPosition::Nowhere); @@ -695,6 +697,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C local_stack_size, reg_type, &csrs, + &csr_slot, isa, prologue_cfa_state, ); @@ -709,6 +712,7 @@ fn insert_common_prologue( stack_size: i64, reg_type: ir::types::Type, csrs: &RegisterSet, + csr_slot: &StackSlot, isa: &dyn TargetIsa, ) -> Option { let word_size = isa.pointer_bytes() as isize; @@ -832,33 +836,44 @@ fn insert_common_prologue( } } - for reg in csrs.iter(FPR) { - // Append param to entry EBB - let csr_arg = pos.func.dfg.append_ebb_param(ebb, types::F64X2); - - // Assign it a location - pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); - - let reg_store_inst = pos.ins().store( - ir::MemFlags::trusted(), - csr_arg, - panic!("get RSP value"), - Offset32::new(panic!("value-appropriate offset")), - ); - - if let Some(ref mut frame_layout) = pos.func.frame_layout { - let mut cfa_state = cfa_state - .as_mut() - .expect("cfa state exists when recording frame layout"); - cfa_state.current_depth -= types::F64X2.bytes() as isize; - frame_layout.instructions.insert( - reg_store_inst, - vec![FrameLayoutChange::RegAt { - reg, - cfa_offset: cfa_state.current_depth, - }] - .into_boxed_slice(), - ); + if csrs.iter(FPR).len() != 0 { + let mut fpr_offset = 0; + + // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base + // address. We are well after postopt could run, so load the CSR region base once here, + // instead of hoping that the addr/store will be combined later. + let stack_addr = pos.ins().stack_addr(types::I64, *csr_slot, 0); + + // At this point we won't be using volatile registers for anything except for return + // At this point we have not saved any CSRs yet, and arguments are all in registers. So + // pick a location for `stack_addr` that is neither non-volatile nor an argument: RAX will + // do. + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rax as u16); + + for reg in csrs.iter(FPR) { + // Append param to entry EBB + let csr_arg = pos.func.dfg.append_ebb_param(ebb, types::F64X2); + + // Assign it a location + pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); + + let reg_store_inst = pos.ins().store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset); + fpr_offset += types::F64X2.bytes() as i32; + + if let Some(ref mut frame_layout) = pos.func.frame_layout { + let mut cfa_state = cfa_state + .as_mut() + .expect("cfa state exists when recording frame layout"); + cfa_state.current_depth -= types::F64X2.bytes() as isize; + frame_layout.instructions.insert( + reg_store_inst, + vec![FrameLayoutChange::RegAt { + reg, + cfa_offset: cfa_state.current_depth, + }] + .into_boxed_slice(), + ); + } } } @@ -938,6 +953,7 @@ fn insert_common_epilogues( stack_size: i64, reg_type: ir::types::Type, csrs: &RegisterSet, + csr_slot: &StackSlot, isa: &dyn TargetIsa, cfa_state: Option, ) { @@ -952,6 +968,7 @@ fn insert_common_epilogues( pos, reg_type, csrs, + csr_slot, isa, is_last, cfa_state.clone(), @@ -969,6 +986,7 @@ fn insert_common_epilogue( pos: &mut EncCursor, reg_type: ir::types::Type, csrs: &RegisterSet, + csr_slot: &StackSlot, isa: &dyn TargetIsa, is_last: bool, mut cfa_state: Option, @@ -1013,25 +1031,35 @@ fn insert_common_epilogue( pos.func.dfg.append_inst_arg(inst, csr_ret); } - for reg in csrs.iter(FPR) { - let value = pos.ins().load( - types::F64X2, - ir::MemFlags::trusted(), - panic!("get RSP value"), - Offset32::new(panic!("value-appropriate offset")), - ); + if csrs.iter(FPR).len() != 0 { + let mut fpr_offset = 0; + + // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base + // address. We are well after postopt could run, so load the CSR region base once here, + // instead of hoping that the addr/store will be combined later. + let stack_addr = pos.ins().stack_addr(types::I64, *csr_slot, 0); + + // At this point we won't be using volatile registers for anything except for return + // registers. Arbitrarily pick RCX as a volatile register that is not used for return + // values. + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rcx as u16); + + for reg in csrs.iter(FPR) { + let value = pos.ins().load(types::F64X2, ir::MemFlags::trusted(), stack_addr, fpr_offset); + fpr_offset += types::F64X2.bytes() as i32; + + if let Some(ref mut cfa_state) = cfa_state.as_mut() { + // Note: don't bother recording a frame layout change because the popped value is + // still correct in memory, and won't be overwritten until we've returned where the + // current frame's layout would no longer matter. Only adjust `current_depth` for a + // consistency check later. + cfa_state.current_depth += types::F64X2.bytes() as isize; + } + pos.prev_inst(); - if let Some(ref mut cfa_state) = cfa_state.as_mut() { - // Note: don't bother recording a frame layout change because the popped value is - // still correct in memory, and won't be overwritten until we've returned where the - // current frame's layout would no longer matter. Only adjust `current_depth` for a - // consistency check later. - cfa_state.current_depth += types::F64X2.bytes() as isize; + pos.func.locations[value] = ir::ValueLoc::Reg(reg); + pos.func.dfg.append_inst_arg(inst, value); } - pos.prev_inst(); - - pos.func.locations[value] = ir::ValueLoc::Reg(reg); - pos.func.dfg.append_inst_arg(inst, value); } if let Some(ref mut frame_layout) = pos.func.frame_layout { diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 54c66f89b7c3..573c95c37dfc 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -35,12 +35,23 @@ block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64): ; check that we preserve xmm6 and above if we're using them locally function %float_callee_saves(f64, f64, f64, f64) windows_fastcall { ebb0(v0: f64, v1: f64, v2: f64, v3: f64): -[-, %xmm6] v4 = fadd v0, v1 ; bin: 66 0f db f3 -;[-, %xmm6] v5 = fadd v4, v2 ; bin: 66 0f db f3 -;[-, %xmm6] v6 = fadd v5, v3 ; bin: 66 0f db f3 +; explicitly use a callee-save register +[-, %xmm6] v4 = fadd v0, v1 +[-, %xmm7] v5 = fadd v0, v1 return } -; check: function %float_callee_saves(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], f64 [%xmm6], i64 fp [%rbp]) -> i64 fp [%rbp] windows_fastcall { +; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall { +; check: ebb0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]): +; nextln: x86_push v6 +; nextln: copy_special %rsp -> %rbp +; nextln: v7 = stack_addr.i64 ss0 +; nextln: store notrap aligned v8, v7 +; nextln: store notrap aligned v9, v7+16 +; check: v11 = stack_addr.i64 ss0 +; nextln: [Op2fldDisp8#410,%xmm7] v13 = load.f64x2 notrap aligned v11+16 +; nextln: [Op2fld#410,%xmm6] v12 = load.f64x2 notrap aligned v11 +; nextln: v10 = x86_pop.i64 +; nextln: v10, v12, v13 function %mixed_int_float(i64, f64, i64, f32) windows_fastcall { block0(v0: i64, v1: f64, v2: i64, v3: f32): From 0537418b90eec59721683173bcc75d5cf2c557ce Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 6 Feb 2020 14:24:11 -0800 Subject: [PATCH 04/41] rustfmt --- cranelift/codegen/src/isa/x86/abi.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index ab0056dba4b0..ff6a1320e995 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -857,7 +857,9 @@ fn insert_common_prologue( // Assign it a location pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); - let reg_store_inst = pos.ins().store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset); + let reg_store_inst = + pos.ins() + .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset); fpr_offset += types::F64X2.bytes() as i32; if let Some(ref mut frame_layout) = pos.func.frame_layout { @@ -1045,7 +1047,12 @@ fn insert_common_epilogue( pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rcx as u16); for reg in csrs.iter(FPR) { - let value = pos.ins().load(types::F64X2, ir::MemFlags::trusted(), stack_addr, fpr_offset); + let value = pos.ins().load( + types::F64X2, + ir::MemFlags::trusted(), + stack_addr, + fpr_offset, + ); fpr_offset += types::F64X2.bytes() as i32; if let Some(ref mut cfa_state) = cfa_state.as_mut() { From 6147df3303bb81c256fc99c0ca2f06b8067a5c01 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 6 Feb 2020 14:39:26 -0800 Subject: [PATCH 05/41] i do not understand why these are being rejected now --- .../filetests/filetests/isa/x86/windows_fastcall_x64.clif | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 573c95c37dfc..11d666a5b9da 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -48,8 +48,8 @@ ebb0(v0: f64, v1: f64, v2: f64, v3: f64): ; nextln: store notrap aligned v8, v7 ; nextln: store notrap aligned v9, v7+16 ; check: v11 = stack_addr.i64 ss0 -; nextln: [Op2fldDisp8#410,%xmm7] v13 = load.f64x2 notrap aligned v11+16 -; nextln: [Op2fld#410,%xmm6] v12 = load.f64x2 notrap aligned v11 +; nextln: v13 = load.f64x2 notrap aligned v11+16 +; nextln: v12 = load.f64x2 notrap aligned v11 ; nextln: v10 = x86_pop.i64 ; nextln: v10, v12, v13 From ad96c5a3b8f9e5152f75c755c7b30fa51782a106 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 6 Feb 2020 14:46:09 -0800 Subject: [PATCH 06/41] rcx is used as a return value register, but r11 is free at return --- cranelift/codegen/src/isa/x86/abi.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index ff6a1320e995..483f7b4b3fe0 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -1042,9 +1042,9 @@ fn insert_common_epilogue( let stack_addr = pos.ins().stack_addr(types::I64, *csr_slot, 0); // At this point we won't be using volatile registers for anything except for return - // registers. Arbitrarily pick RCX as a volatile register that is not used for return + // registers. Arbitrarily pick R11 as a volatile register that is not used for return // values. - pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rcx as u16); + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16); for reg in csrs.iter(FPR) { let value = pos.ins().load( From 59d23fa48480fdee99412f1625b4e679ebef774f Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 6 Feb 2020 15:21:08 -0800 Subject: [PATCH 07/41] use rbp instead of r11 to work around missing fstDisp8/fldDisp8 encodings --- cranelift/codegen/src/isa/x86/abi.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index 483f7b4b3fe0..1060a145d9a2 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -1042,9 +1042,9 @@ fn insert_common_epilogue( let stack_addr = pos.ins().stack_addr(types::I64, *csr_slot, 0); // At this point we won't be using volatile registers for anything except for return - // registers. Arbitrarily pick R11 as a volatile register that is not used for return - // values. - pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16); + // registers. Arbitrarily pick RBP as it won't hold an interesting value, and we're about + // to restore it at the end of the function anyway. + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rbp as u16); for reg in csrs.iter(FPR) { let value = pos.ins().load( From bee0989c7a510c2701c23373d349feae4fc542bf Mon Sep 17 00:00:00 2001 From: iximeow Date: Mon, 10 Feb 2020 14:43:14 -0800 Subject: [PATCH 08/41] avoid growing callee-save space when already 16-byte aligned --- cranelift/codegen/src/isa/x86/abi.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index 1060a145d9a2..01b7e9127f4e 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -574,7 +574,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C csr_stack_size += (num_fprs * types::F64X2.bytes() as usize) as i32; // Ensure that csr stack space is 16-byte aligned for ideal SIMD value placement. - csr_stack_size = csr_stack_size & !0b1111 + 16; + csr_stack_size = (csr_stack_size + 15) & !0b1111; } // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work @@ -732,7 +732,7 @@ fn insert_common_prologue( total_stack_size += csrs.iter(FPR).len() as i64 * types::F64X2.bytes() as i64; // Ensure that csr stack space is 16-byte aligned for ideal SIMD value placement. - total_stack_size = total_stack_size & !0b1111 + 16; + total_stack_size = (total_stack_size + 15) & !0b1111; } insert_stack_check(pos, total_stack_size, stack_limit_arg); From 8f33538df200e769a34e9a2006d8aa5accdc98dd Mon Sep 17 00:00:00 2001 From: iximeow Date: Mon, 10 Feb 2020 17:34:20 -0800 Subject: [PATCH 09/41] fix merge confict with EBB rename --- cranelift/codegen/src/isa/x86/abi.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index 01b7e9127f4e..c7e5c7bf61bc 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -851,8 +851,8 @@ fn insert_common_prologue( pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rax as u16); for reg in csrs.iter(FPR) { - // Append param to entry EBB - let csr_arg = pos.func.dfg.append_ebb_param(ebb, types::F64X2); + // Append param to entry Block + let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2); // Assign it a location pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); From eefa4fc529f281a8d23da2bed0cc1e50086a6951 Mon Sep 17 00:00:00 2001 From: iximeow Date: Mon, 10 Feb 2020 17:35:27 -0800 Subject: [PATCH 10/41] tests are real too --- .../filetests/filetests/isa/x86/windows_fastcall_x64.clif | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 11d666a5b9da..25390adc5bc4 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -34,14 +34,14 @@ block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64): ; check that we preserve xmm6 and above if we're using them locally function %float_callee_saves(f64, f64, f64, f64) windows_fastcall { -ebb0(v0: f64, v1: f64, v2: f64, v3: f64): +block0(v0: f64, v1: f64, v2: f64, v3: f64): ; explicitly use a callee-save register [-, %xmm6] v4 = fadd v0, v1 [-, %xmm7] v5 = fadd v0, v1 return } ; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall { -; check: ebb0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]): +; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]): ; nextln: x86_push v6 ; nextln: copy_special %rsp -> %rbp ; nextln: v7 = stack_addr.i64 ss0 From 2e632bd7be494f840cce09d976d45e3cc6d2f9b0 Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 12 Feb 2020 15:37:48 -0800 Subject: [PATCH 11/41] also record xmm preservation in Windows fastcall unwind info --- cranelift/codegen/src/isa/x86/unwind.rs | 54 ++++++++++++++++--- .../isa/x86/windows_fastcall_x64_unwind.clif | 50 +++++++++++++++-- cranelift/filetests/src/test_unwind.rs | 24 ++++----- 3 files changed, 106 insertions(+), 22 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index de653b89a68f..2810fa9f76ab 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -1,8 +1,8 @@ //! Unwind information for x64 Windows. -use super::registers::{GPR, RU}; +use super::registers::{FPR, GPR, RU}; use crate::binemit::FrameUnwindSink; -use crate::ir::{Function, InstructionData, Opcode}; +use crate::ir::{Function, InstructionData, Opcode, ValueLoc}; use crate::isa::{CallConv, RegUnit, TargetIsa}; use alloc::vec::Vec; use byteorder::{ByteOrder, LittleEndian}; @@ -36,6 +36,7 @@ fn write_u32(sink: &mut dyn FrameUnwindSink, v: u32) { #[derive(Debug, PartialEq, Eq)] enum UnwindCode { PushRegister { offset: u8, reg: RegUnit }, + SaveXmm { offset: u8, reg: RegUnit, stack_offset: u32 }, StackAlloc { offset: u8, size: u32 }, SetFramePointer { offset: u8, sp_offset: u8 }, } @@ -43,10 +44,12 @@ enum UnwindCode { impl UnwindCode { fn emit(&self, sink: &mut dyn FrameUnwindSink) { enum UnwindOperation { - PushNonvolatileRegister, - LargeStackAlloc, - SmallStackAlloc, - SetFramePointer, + PushNonvolatileRegister = 0, + LargeStackAlloc = 1, + SmallStackAlloc = 2, + SetFramePointer = 3, + SaveXmm128 = 8, + SaveXmm128Far = 9, } match self { @@ -58,6 +61,23 @@ impl UnwindCode { | (UnwindOperation::PushNonvolatileRegister as u8), ); } + Self::SaveXmm { offset, reg, stack_offset } => { + write_u8(sink, *offset); + if *stack_offset <= core::u16::MAX as u32 { + write_u8( + sink, + (FPR.index_of(*reg) << 4) as u8 | (UnwindOperation::SaveXmm128 as u8), + ); + write_u16::(sink, *stack_offset as u16); + } else { + write_u8( + sink, + (FPR.index_of(*reg) << 4) as u8 | (UnwindOperation::SaveXmm128Far as u8), + ); + write_u16::(sink, *stack_offset as u16); + write_u16::(sink, (*stack_offset >> 16) as u16); + } + } Self::StackAlloc { offset, size } => { // Stack allocations on Windows must be a multiple of 8 and be at least 1 slot assert!(*size >= 8); @@ -98,6 +118,13 @@ impl UnwindCode { 3 } } + Self::SaveXmm { stack_offset, .. } => { + if *stack_offset <= core::u16::MAX as u32 { + 2 + } else { + 3 + } + }, _ => 1, } } @@ -202,6 +229,21 @@ impl UnwindInfo { _ => {} } } + InstructionData::Store { opcode: Opcode::Store, args: [arg1, arg2], flags, offset } => { + if let ValueLoc::Reg(ru) = func.locations[arg1] { + let offset_int: i32 = offset.into(); + assert!(offset_int >= 0); + assert!(offset_int % 16 == 0); + let scaled_offset = offset_int as u32 / 16; + if FPR.contains(ru) { + unwind_codes.push(UnwindCode::SaveXmm { + offset: unwind_offset, + reg: ru, + stack_offset: scaled_offset, + }); + } + } + } _ => {} }; diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif index b146f0ac7696..559171ec0f1e 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif @@ -134,6 +134,15 @@ block0(v0: i64, v1: i64): v12 = load.i32 v0+80 v13 = load.i32 v0+88 v14 = load.i32 v0+96 + v15 = load.f64 v0+104 + v16 = load.f64 v0+112 + v17 = load.f64 v0+120 + v18 = load.f64 v0+128 + v19 = load.f64 v0+136 + v20 = load.f64 v0+144 + v21 = load.f64 v0+152 + v22 = load.f64 v0+160 + v23 = load.f64 v0+168 store.i32 v2, v1+0 store.i32 v3, v1+8 store.i32 v4, v1+16 @@ -147,23 +156,56 @@ block0(v0: i64, v1: i64): store.i32 v12, v1+80 store.i32 v13, v1+88 store.i32 v14, v1+96 + store.f64 v15, v1+104 + store.f64 v16, v1+112 + store.f64 v17, v1+120 + store.f64 v18, v1+128 + store.f64 v19, v1+136 + store.f64 v20, v1+144 + store.f64 v21, v1+152 + store.f64 v22, v1+160 + store.f64 v23, v1+168 return } ; sameln: UnwindInfo { ; nextln: version: 1, ; nextln: flags: 0, -; nextln: prologue_size: 19, -; nextln: unwind_code_count_raw: 10, +; nextln: prologue_size: 42, +; nextln: unwind_code_count_raw: 16, ; nextln: frame_register: 5, ; nextln: frame_register_offset: 0, ; nextln: unwind_codes: [ ; nextln: UnwindCode { -; nextln: offset: 19, +; nextln: offset: 42, ; nextln: op: SmallStackAlloc, -; nextln: info: 3, +; nextln: info: 5, ; nextln: value: None, ; nextln: }, ; nextln: UnwindCode { +; nextln: offset: 38, +; nextln: op: SaveXmm128, +; nextln: info: 8, +; nextln: value: U16( +; nextln: 2, +; nextln: ), +; nextln: }, +; nextln: UnwindCode { +; nextln: offset: 32, +; nextln: op: SaveXmm128, +; nextln: info: 7, +; nextln: value: U16( +; nextln: 1, +; nextln: ), +; nextln: }, +; nextln: UnwindCode { +; nextln: offset: 27, +; nextln: op: SaveXmm128, +; nextln: info: 6, +; nextln: value: U16( +; nextln: 0, +; nextln: ), +; nextln: }, +; nextln: UnwindCode { ; nextln: offset: 15, ; nextln: op: PushNonvolatileRegister, ; nextln: info: 15, diff --git a/cranelift/filetests/src/test_unwind.rs b/cranelift/filetests/src/test_unwind.rs index 3db1cbf8299e..e8b2667fc6e9 100644 --- a/cranelift/filetests/src/test_unwind.rs +++ b/cranelift/filetests/src/test_unwind.rs @@ -177,15 +177,15 @@ impl UnwindCode { #[derive(Debug)] enum UnwindOperation { - PushNonvolatileRegister, - LargeStackAlloc, - SmallStackAlloc, - SetFramePointer, - SaveNonvolatileRegister, - SaveNonvolatileRegisterFar, - SaveXmm128, - SaveXmm128Far, - PushMachineFrame, + PushNonvolatileRegister = 0, + LargeStackAlloc = 1, + SmallStackAlloc = 2, + SetFramePointer = 3, + SaveNonvolatileRegister = 4, + SaveNonvolatileRegisterFar = 5, + SaveXmm128 = 8, + SaveXmm128Far = 9, + PushMachineFrame = 10, } impl From for UnwindOperation { @@ -198,9 +198,9 @@ impl From for UnwindOperation { 3 => Self::SetFramePointer, 4 => Self::SaveNonvolatileRegister, 5 => Self::SaveNonvolatileRegisterFar, - 6 => Self::SaveXmm128, - 7 => Self::SaveXmm128Far, - 8 => Self::PushMachineFrame, + 8 => Self::SaveXmm128, + 9 => Self::SaveXmm128Far, + 10 => Self::PushMachineFrame, _ => panic!("unsupported unwind operation"), } } From 3de41fa3798add49b73a7c233f456e8f521744b1 Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 12 Feb 2020 15:38:28 -0800 Subject: [PATCH 12/41] turns out f64x2 tries to use the nonexistent fstDisp8 encoding for f64x2, so use regular f64 instead --- cranelift/codegen/src/isa/x86/abi.rs | 6 +++--- .../filetests/filetests/isa/x86/windows_fastcall_x64.clif | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index c7e5c7bf61bc..ceefd13b151a 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -613,7 +613,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // the low 128 bits here, but we may find that in practice we should preserve all of // YMM6-15 (or even ZMM6-15?) let csr_arg = - ir::AbiParam::special_reg(types::F64X2, ir::ArgumentPurpose::CalleeSaved, fp_csr); + ir::AbiParam::special_reg(types::F64, ir::ArgumentPurpose::CalleeSaved, fp_csr); func.signature.params.push(csr_arg); func.signature.returns.push(csr_arg); } @@ -852,7 +852,7 @@ fn insert_common_prologue( for reg in csrs.iter(FPR) { // Append param to entry Block - let csr_arg = pos.func.dfg.append_block_param(block, types::F64X2); + let csr_arg = pos.func.dfg.append_block_param(block, types::F64); // Assign it a location pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); @@ -1048,7 +1048,7 @@ fn insert_common_epilogue( for reg in csrs.iter(FPR) { let value = pos.ins().load( - types::F64X2, + types::F64, ir::MemFlags::trusted(), stack_addr, fpr_offset, diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 25390adc5bc4..ed2c429ff297 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -40,16 +40,16 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64): [-, %xmm7] v5 = fadd v0, v1 return } -; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7]) -> i64 fp [%rbp], f64x2 csr [%xmm6], f64x2 csr [%xmm7] windows_fastcall { -; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64x2 [%xmm6], v9: f64x2 [%xmm7]): +; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7]) -> i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7] windows_fastcall { +; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64 [%xmm6], v9: f64 [%xmm7]): ; nextln: x86_push v6 ; nextln: copy_special %rsp -> %rbp ; nextln: v7 = stack_addr.i64 ss0 ; nextln: store notrap aligned v8, v7 ; nextln: store notrap aligned v9, v7+16 ; check: v11 = stack_addr.i64 ss0 -; nextln: v13 = load.f64x2 notrap aligned v11+16 -; nextln: v12 = load.f64x2 notrap aligned v11 +; nextln: v13 = load.f64 notrap aligned v11+16 +; nextln: v12 = load.f64 notrap aligned v11 ; nextln: v10 = x86_pop.i64 ; nextln: v10, v12, v13 From aa5d68e420821d1686b1d5cda946ce33a627d985 Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 12 Feb 2020 15:40:56 -0800 Subject: [PATCH 13/41] trusty rustfmt --- cranelift/codegen/src/isa/x86/abi.rs | 9 +++---- cranelift/codegen/src/isa/x86/unwind.rs | 36 ++++++++++++++++++++----- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index ceefd13b151a..c86b4be4de4e 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -1047,12 +1047,9 @@ fn insert_common_epilogue( pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rbp as u16); for reg in csrs.iter(FPR) { - let value = pos.ins().load( - types::F64, - ir::MemFlags::trusted(), - stack_addr, - fpr_offset, - ); + let value = pos + .ins() + .load(types::F64, ir::MemFlags::trusted(), stack_addr, fpr_offset); fpr_offset += types::F64X2.bytes() as i32; if let Some(ref mut cfa_state) = cfa_state.as_mut() { diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index 2810fa9f76ab..136bc04db8e1 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -35,10 +35,23 @@ fn write_u32(sink: &mut dyn FrameUnwindSink, v: u32) { /// Note: the Cranelift x86 ISA RU enum matches the Windows unwind GPR encoding values. #[derive(Debug, PartialEq, Eq)] enum UnwindCode { - PushRegister { offset: u8, reg: RegUnit }, - SaveXmm { offset: u8, reg: RegUnit, stack_offset: u32 }, - StackAlloc { offset: u8, size: u32 }, - SetFramePointer { offset: u8, sp_offset: u8 }, + PushRegister { + offset: u8, + reg: RegUnit, + }, + SaveXmm { + offset: u8, + reg: RegUnit, + stack_offset: u32, + }, + StackAlloc { + offset: u8, + size: u32, + }, + SetFramePointer { + offset: u8, + sp_offset: u8, + }, } impl UnwindCode { @@ -61,7 +74,11 @@ impl UnwindCode { | (UnwindOperation::PushNonvolatileRegister as u8), ); } - Self::SaveXmm { offset, reg, stack_offset } => { + Self::SaveXmm { + offset, + reg, + stack_offset, + } => { write_u8(sink, *offset); if *stack_offset <= core::u16::MAX as u32 { write_u8( @@ -124,7 +141,7 @@ impl UnwindCode { } else { 3 } - }, + } _ => 1, } } @@ -229,7 +246,12 @@ impl UnwindInfo { _ => {} } } - InstructionData::Store { opcode: Opcode::Store, args: [arg1, arg2], flags, offset } => { + InstructionData::Store { + opcode: Opcode::Store, + args: [arg1, _arg2], + flags: _flags, + offset, + } => { if let ValueLoc::Reg(ru) = func.locations[arg1] { let offset_int: i32 = offset.into(); assert!(offset_int >= 0); From 3ce7acffdb2c2ae61e4d82aa3142ebb12a201c4b Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 12 Feb 2020 16:01:01 -0800 Subject: [PATCH 14/41] clarify that Windows only requires the low 128 bits of floating point registers --- cranelift/codegen/src/isa/x86/abi.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index c86b4be4de4e..d3eaf4acbf85 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -608,10 +608,12 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C for fp_csr in csrs.iter(FPR) { // The calling convention described in - // https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention only specifically - // discusses XMM6-XMM15, which are 128-bit registers. It may be sufficient to preserve only - // the low 128 bits here, but we may find that in practice we should preserve all of - // YMM6-15 (or even ZMM6-15?) + // https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention only requires + // preserving the low 128 bits of XMM6-XMM15. + // + // TODO: For now, add just an `F64` rather than `F64X2` because `F64X2` would require + // encoding a fstDisp8 with REX bits set, and we currently can't encode that. F64 causes a + // whole XMM register to be preserved anyway. let csr_arg = ir::AbiParam::special_reg(types::F64, ir::ArgumentPurpose::CalleeSaved, fp_csr); func.signature.params.push(csr_arg); From cc99f141c21ff8774239d452f10932b46e15c42b Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 20 Feb 2020 21:19:52 -0800 Subject: [PATCH 15/41] fix FPR preservation overlapping with GPR stack area --- cranelift/codegen/src/isa/x86/abi.rs | 213 ++++++++++-------- .../isa/x86/windows_fastcall_x64.clif | 12 +- .../isa/x86/windows_fastcall_x64_unwind.clif | 44 ++-- 3 files changed, 143 insertions(+), 126 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index d3eaf4acbf85..c1c89f585336 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -568,19 +568,22 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // space for this frame. let word_size = isa.pointer_bytes() as usize; let num_fprs = csrs.iter(FPR).len(); - let mut csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; - - if num_fprs != 0 { - csr_stack_size += (num_fprs * types::F64X2.bytes() as usize) as i32; + let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; - // Ensure that csr stack space is 16-byte aligned for ideal SIMD value placement. - csr_stack_size = (csr_stack_size + 15) & !0b1111; - } + // Only create an FPR stack slot if we're going to save FPRs. + let fpr_slot = if num_fprs > 0 { + Some(func.create_stack_slot(ir::StackSlotData { + kind: ir::StackSlotKind::SpillSlot, + size: (num_fprs * types::F64X2.bytes() as usize) as u32, + offset: None, + })) + } else { + None + }; // TODO: eventually use the 32 bytes (shadow store) as spill slot. This currently doesn't work // since cranelift does not support spill slots before incoming args - - let csr_slot = func.create_stack_slot(ir::StackSlotData { + func.create_stack_slot(ir::StackSlotData { kind: ir::StackSlotKind::IncomingArg, size: csr_stack_size as u32, offset: Some(-(WIN_SHADOW_STACK_SPACE + csr_stack_size)), @@ -624,7 +627,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C let entry_block = func.layout.entry_block().expect("missing entry block"); let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); let prologue_cfa_state = - insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, &csr_slot, isa); + insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, fpr_slot.as_ref(), isa); // Reset the cursor and insert the epilogue let mut pos = pos.at_position(CursorPosition::Nowhere); @@ -633,7 +636,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C local_stack_size, reg_type, &csrs, - &csr_slot, + fpr_slot.as_ref(), isa, prologue_cfa_state, ); @@ -660,7 +663,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // will adjust the stack pointer to make room for the rest of the required // space for this frame. let csr_stack_size = ((csrs.iter(GPR).len() + 2) * word_size) as i32; - let csr_slot = func.create_stack_slot(ir::StackSlotData { + func.create_stack_slot(ir::StackSlotData { kind: ir::StackSlotKind::IncomingArg, size: csr_stack_size as u32, offset: Some(-csr_stack_size), @@ -690,7 +693,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C let entry_block = func.layout.entry_block().expect("missing entry block"); let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); let prologue_cfa_state = - insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, &csr_slot, isa); + insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, None, isa); // Reset the cursor and insert the epilogue let mut pos = pos.at_position(CursorPosition::Nowhere); @@ -699,7 +702,7 @@ fn system_v_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C local_stack_size, reg_type, &csrs, - &csr_slot, + None, isa, prologue_cfa_state, ); @@ -714,7 +717,7 @@ fn insert_common_prologue( stack_size: i64, reg_type: ir::types::Type, csrs: &RegisterSet, - csr_slot: &StackSlot, + fpr_slot: Option<&StackSlot>, isa: &dyn TargetIsa, ) -> Option { let word_size = isa.pointer_bytes() as isize; @@ -729,13 +732,7 @@ fn insert_common_prologue( // TODO: Check if the function body actually contains a `call` instruction. let mut total_stack_size = (csrs.iter(GPR).len() + 1 + 1) as i64 * word_size as i64; - let num_fpr = csrs.iter(FPR).len(); - if num_fpr > 0 { - total_stack_size += csrs.iter(FPR).len() as i64 * types::F64X2.bytes() as i64; - - // Ensure that csr stack space is 16-byte aligned for ideal SIMD value placement. - total_stack_size = (total_stack_size + 15) & !0b1111; - } + total_stack_size += csrs.iter(FPR).len() as i64 * types::F64X2.bytes() as i64; insert_stack_check(pos, total_stack_size, stack_limit_arg); } @@ -838,49 +835,6 @@ fn insert_common_prologue( } } - if csrs.iter(FPR).len() != 0 { - let mut fpr_offset = 0; - - // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base - // address. We are well after postopt could run, so load the CSR region base once here, - // instead of hoping that the addr/store will be combined later. - let stack_addr = pos.ins().stack_addr(types::I64, *csr_slot, 0); - - // At this point we won't be using volatile registers for anything except for return - // At this point we have not saved any CSRs yet, and arguments are all in registers. So - // pick a location for `stack_addr` that is neither non-volatile nor an argument: RAX will - // do. - pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rax as u16); - - for reg in csrs.iter(FPR) { - // Append param to entry Block - let csr_arg = pos.func.dfg.append_block_param(block, types::F64); - - // Assign it a location - pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); - - let reg_store_inst = - pos.ins() - .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset); - fpr_offset += types::F64X2.bytes() as i32; - - if let Some(ref mut frame_layout) = pos.func.frame_layout { - let mut cfa_state = cfa_state - .as_mut() - .expect("cfa state exists when recording frame layout"); - cfa_state.current_depth -= types::F64X2.bytes() as isize; - frame_layout.instructions.insert( - reg_store_inst, - vec![FrameLayoutChange::RegAt { - reg, - cfa_offset: cfa_state.current_depth, - }] - .into_boxed_slice(), - ); - } - } - } - // Allocate stack frame storage. if stack_size > 0 { if isa.flags().enable_probestack() && stack_size > (1 << isa.flags().probestack_size_log2()) @@ -925,6 +879,54 @@ fn insert_common_prologue( } } + // Now that RSP is prepared for the function, we can use stack slots: + if csrs.iter(FPR).len() != 0 { + let mut fpr_offset = 0; + + // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base + // address. We are well after postopt could run, so load the CSR region base once here, + // instead of hoping that the addr/store will be combined later. + let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), 0); + + // At this point we won't be using volatile registers for anything except for return + // At this point we have not saved any CSRs yet, and arguments are all in registers. So + // pick a location for `stack_addr` that is neither non-volatile nor an argument: RAX will + // do. + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rax as u16); + + for reg in csrs.iter(FPR) { + // Append param to entry Block + let csr_arg = pos.func.dfg.append_block_param(block, types::F64); + + // Since regalloc has already run, we must assign a location. + pos.func.locations[csr_arg] = ir::ValueLoc::Reg(reg); + + let reg_store_inst = + pos.ins() + .store(ir::MemFlags::trusted(), csr_arg, stack_addr, fpr_offset); + + // If we preserve FPRs, they occur after SP is adjusted, so also fix up the end point + // to this new instruction. + pos.func.prologue_end = Some(reg_store_inst); + fpr_offset += types::F64X2.bytes() as i32; + + if let Some(ref mut frame_layout) = pos.func.frame_layout { + let mut cfa_state = cfa_state + .as_mut() + .expect("cfa state exists when recording frame layout"); + cfa_state.current_depth -= types::F64X2.bytes() as isize; + frame_layout.instructions.insert( + reg_store_inst, + vec![FrameLayoutChange::RegAt { + reg, + cfa_offset: cfa_state.current_depth, + }] + .into_boxed_slice(), + ); + } + } + } + cfa_state } @@ -957,7 +959,7 @@ fn insert_common_epilogues( stack_size: i64, reg_type: ir::types::Type, csrs: &RegisterSet, - csr_slot: &StackSlot, + fpr_slot: Option<&StackSlot>, isa: &dyn TargetIsa, cfa_state: Option, ) { @@ -972,7 +974,7 @@ fn insert_common_epilogues( pos, reg_type, csrs, - csr_slot, + fpr_slot, isa, is_last, cfa_state.clone(), @@ -990,12 +992,54 @@ fn insert_common_epilogue( pos: &mut EncCursor, reg_type: ir::types::Type, csrs: &RegisterSet, - csr_slot: &StackSlot, + fpr_slot: Option<&StackSlot>, isa: &dyn TargetIsa, is_last: bool, mut cfa_state: Option, ) { let word_size = isa.pointer_bytes() as isize; + + // Even though instructions to restore FPRs are inserted first, we have to append them after + // restored GPRs to satisfy parameter order in the return. + let mut restored_fpr_values = alloc::vec::Vec::new(); + + // Restore FPRs before we move RSP and invalidate stack slots. + if csrs.iter(FPR).len() != 0 { + let mut fpr_offset = 0; + + // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base + // address. We are well after postopt could run, so load the CSR region base once here, + // instead of hoping that the addr/store will be combined later. + let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), 0); + + // At this point we won't be using volatile registers for anything except for return + // registers. Arbitrarily pick RBP as it won't hold an interesting value, and we're about + // to restore it at the end of the function anyway. + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rbp as u16); + + for reg in csrs.iter(FPR) { + let value = pos + .ins() + .load(types::F64, ir::MemFlags::trusted(), stack_addr, fpr_offset); + fpr_offset += types::F64X2.bytes() as i32; + + if let Some(ref mut cfa_state) = cfa_state.as_mut() { + // Note: don't bother recording a frame layout change because the popped value is + // still correct in memory, and won't be overwritten until we've returned where the + // current frame's layout would no longer matter. Only adjust `current_depth` for a + // consistency check later. + cfa_state.current_depth += types::F64X2.bytes() as isize; + } + // Unlike GPRs before, we don't need to step back after reach restoration because FPR + // restoration is order-insensitive. Furthermore: we want GPR restoration to begin + // after FPR restoration, so that stack adjustments occur after we're done relying on + // StackSlot validity. + + pos.func.locations[value] = ir::ValueLoc::Reg(reg); + restored_fpr_values.push(value); + } + } + if stack_size > 0 { pos.ins().adjust_sp_up_imm(Imm64::new(stack_size)); } @@ -1035,37 +1079,8 @@ fn insert_common_epilogue( pos.func.dfg.append_inst_arg(inst, csr_ret); } - if csrs.iter(FPR).len() != 0 { - let mut fpr_offset = 0; - - // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base - // address. We are well after postopt could run, so load the CSR region base once here, - // instead of hoping that the addr/store will be combined later. - let stack_addr = pos.ins().stack_addr(types::I64, *csr_slot, 0); - - // At this point we won't be using volatile registers for anything except for return - // registers. Arbitrarily pick RBP as it won't hold an interesting value, and we're about - // to restore it at the end of the function anyway. - pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rbp as u16); - - for reg in csrs.iter(FPR) { - let value = pos - .ins() - .load(types::F64, ir::MemFlags::trusted(), stack_addr, fpr_offset); - fpr_offset += types::F64X2.bytes() as i32; - - if let Some(ref mut cfa_state) = cfa_state.as_mut() { - // Note: don't bother recording a frame layout change because the popped value is - // still correct in memory, and won't be overwritten until we've returned where the - // current frame's layout would no longer matter. Only adjust `current_depth` for a - // consistency check later. - cfa_state.current_depth += types::F64X2.bytes() as isize; - } - pos.prev_inst(); - - pos.func.locations[value] = ir::ValueLoc::Reg(reg); - pos.func.dfg.append_inst_arg(inst, value); - } + for value in restored_fpr_values.into_iter() { + pos.func.dfg.append_inst_arg(inst, value); } if let Some(ref mut frame_layout) = pos.func.frame_layout { diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index ed2c429ff297..7078f06f2ab9 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -44,14 +44,16 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64): ; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64 [%xmm6], v9: f64 [%xmm7]): ; nextln: x86_push v6 ; nextln: copy_special %rsp -> %rbp +; nextln: adjust_sp_down_imm 64 ; nextln: v7 = stack_addr.i64 ss0 ; nextln: store notrap aligned v8, v7 ; nextln: store notrap aligned v9, v7+16 -; check: v11 = stack_addr.i64 ss0 -; nextln: v13 = load.f64 notrap aligned v11+16 -; nextln: v12 = load.f64 notrap aligned v11 -; nextln: v10 = x86_pop.i64 -; nextln: v10, v12, v13 +; check: v10 = stack_addr.i64 ss0 +; nextln: v11 = load.f64 notrap aligned v10 +; nextln: v12 = load.f64 notrap aligned v10+16 +; nextln: adjust_sp_up_imm 64 +; nextln: v13 = x86_pop.i64 +; nextln: v13, v11, v12 function %mixed_int_float(i64, f64, i64, f32) windows_fastcall { block0(v0: i64, v1: f64, v2: i64, v3: f32): diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif index 559171ec0f1e..ee15b2f96599 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif @@ -177,33 +177,33 @@ block0(v0: i64, v1: i64): ; nextln: unwind_codes: [ ; nextln: UnwindCode { ; nextln: offset: 42, -; nextln: op: SmallStackAlloc, -; nextln: info: 5, -; nextln: value: None, -; nextln: }, +; nextln: op: SaveXmm128, +; nextln: info: 8, +; nextln: value: U16( +; nextln: 2, +; nextln: ), +; nextln: }, ; nextln: UnwindCode { -; nextln: offset: 38, -; nextln: op: SaveXmm128, -; nextln: info: 8, -; nextln: value: U16( -; nextln: 2, -; nextln: ), +; nextln: offset: 36, +; nextln: op: SaveXmm128, +; nextln: info: 7, +; nextln: value: U16( +; nextln: 1, +; nextln: ), ; nextln: }, ; nextln: UnwindCode { -; nextln: offset: 32, -; nextln: op: SaveXmm128, -; nextln: info: 7, -; nextln: value: U16( -; nextln: 1, -; nextln: ), +; nextln: offset: 31, +; nextln: op: SaveXmm128, +; nextln: info: 6, +; nextln: value: U16( +; nextln: 0, +; nextln: ), ; nextln: }, ; nextln: UnwindCode { -; nextln: offset: 27, -; nextln: op: SaveXmm128, -; nextln: info: 6, -; nextln: value: U16( -; nextln: 0, -; nextln: ), +; nextln: offset: 19, +; nextln: op: SmallStackAlloc, +; nextln: info: 12, +; nextln: value: None, ; nextln: }, ; nextln: UnwindCode { ; nextln: offset: 15, From 597c99459f8f1cc546f0e59d500978ab39ace2f0 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 20 Feb 2020 21:21:49 -0800 Subject: [PATCH 16/41] rustfmt --- cranelift/codegen/src/isa/x86/abi.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index c1c89f585336..6c2cb7bdf916 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -626,8 +626,14 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // Set up the cursor and insert the prologue let entry_block = func.layout.entry_block().expect("missing entry block"); let mut pos = EncCursor::new(func, isa).at_first_insertion_point(entry_block); - let prologue_cfa_state = - insert_common_prologue(&mut pos, local_stack_size, reg_type, &csrs, fpr_slot.as_ref(), isa); + let prologue_cfa_state = insert_common_prologue( + &mut pos, + local_stack_size, + reg_type, + &csrs, + fpr_slot.as_ref(), + isa, + ); // Reset the cursor and insert the epilogue let mut pos = pos.at_position(CursorPosition::Nowhere); @@ -886,7 +892,11 @@ fn insert_common_prologue( // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base // address. We are well after postopt could run, so load the CSR region base once here, // instead of hoping that the addr/store will be combined later. - let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), 0); + let stack_addr = pos.ins().stack_addr( + types::I64, + *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), + 0, + ); // At this point we won't be using volatile registers for anything except for return // At this point we have not saved any CSRs yet, and arguments are all in registers. So @@ -1010,7 +1020,11 @@ fn insert_common_epilogue( // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base // address. We are well after postopt could run, so load the CSR region base once here, // instead of hoping that the addr/store will be combined later. - let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), 0); + let stack_addr = pos.ins().stack_addr( + types::I64, + *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), + 0, + ); // At this point we won't be using volatile registers for anything except for return // registers. Arbitrarily pick RBP as it won't hold an interesting value, and we're about From d22ec8a8d5fe25677005b03ea1b3efb4951059fc Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 20 Feb 2020 21:29:14 -0800 Subject: [PATCH 17/41] specify that ss0 should be a 32-byte spill slot, sufficient for two xmm registers --- cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 7078f06f2ab9..6ea7140fd560 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -41,6 +41,8 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64): return } ; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7]) -> i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7] windows_fastcall { +; nextln: ss0 = spill_slot 32, offset -80 +; nextln: ss1 = incoming_arg 16, offset -48 ; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64 [%xmm6], v9: f64 [%xmm7]): ; nextln: x86_push v6 ; nextln: copy_special %rsp -> %rbp From 9489c05c3695bf20e34e0249f46cbf7660b194ce Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 3 Mar 2020 13:39:09 -0800 Subject: [PATCH 18/41] almost fix offset calculations kind of sort of --- cranelift/codegen/src/isa/x86/abi.rs | 16 ++++++---------- cranelift/codegen/src/isa/x86/unwind.rs | 22 +++++++++++++++++----- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index 6c2cb7bdf916..ba390bca7da1 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -892,17 +892,14 @@ fn insert_common_prologue( // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base // address. We are well after postopt could run, so load the CSR region base once here, // instead of hoping that the addr/store will be combined later. + // See also: https://github.com/bytecodealliance/wasmtime/pull/1198 let stack_addr = pos.ins().stack_addr( types::I64, *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), 0, ); - // At this point we won't be using volatile registers for anything except for return - // At this point we have not saved any CSRs yet, and arguments are all in registers. So - // pick a location for `stack_addr` that is neither non-volatile nor an argument: RAX will - // do. - pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rax as u16); + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16); for reg in csrs.iter(FPR) { // Append param to entry Block @@ -1017,19 +1014,18 @@ fn insert_common_epilogue( if csrs.iter(FPR).len() != 0 { let mut fpr_offset = 0; - // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base + // `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base // address. We are well after postopt could run, so load the CSR region base once here, // instead of hoping that the addr/store will be combined later. + // + // See also: https://github.com/bytecodealliance/wasmtime/pull/1198 let stack_addr = pos.ins().stack_addr( types::I64, *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), 0, ); - // At this point we won't be using volatile registers for anything except for return - // registers. Arbitrarily pick RBP as it won't hold an interesting value, and we're about - // to restore it at the end of the function anyway. - pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::rbp as u16); + pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16); for reg in csrs.iter(FPR) { let value = pos diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index 136bc04db8e1..8a95dabf74b9 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -180,6 +180,11 @@ impl UnwindInfo { let mut unwind_codes = Vec::new(); let mut found_end = false; + // For the time being, FPR preservation is split into a stack_addr and later store/load. + // Store the register used for stack store and ensure it is the same register with no + // intervening changes to the frame size. + let mut frame_start_reg = None; + for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) { // x64 ABI prologues cannot exceed 255 bytes in length if (offset + size) > 255 { @@ -194,6 +199,8 @@ impl UnwindInfo { InstructionData::Unary { opcode, arg } => { match opcode { Opcode::X86Push => { + frame_offset += 8; + unwind_codes.push(UnwindCode::PushRegister { offset: unwind_offset, reg: func.locations[arg].unwrap_reg(), @@ -238,6 +245,8 @@ impl UnwindInfo { let imm: i64 = imm.into(); assert!(imm <= core::u32::MAX as i64); + frame_offset += imm as u32; + unwind_codes.push(UnwindCode::StackAlloc { offset: unwind_offset, size: imm as u32, @@ -248,15 +257,18 @@ impl UnwindInfo { } InstructionData::Store { opcode: Opcode::Store, - args: [arg1, _arg2], + args: [arg1, arg2], flags: _flags, offset, } => { - if let ValueLoc::Reg(ru) = func.locations[arg1] { + if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru) = (func.locations[arg1], func.locations[arg2]) { + if base_ru != frame_start_reg { + continue; + } let offset_int: i32 = offset.into(); - assert!(offset_int >= 0); - assert!(offset_int % 16 == 0); - let scaled_offset = offset_int as u32 / 16; + assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error"); + assert!(offset_int % 16 == 0, "xmm preservation offset must be 16-byte aligned, but was {:#x}", offset_int); + let scaled_offset = offset_int / 16; if FPR.contains(ru) { unwind_codes.push(UnwindCode::SaveXmm { offset: unwind_offset, From cc2975a2e3c6989e3ac1ae241bedf0a0b79aa5fb Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 6 Mar 2020 00:36:02 -0800 Subject: [PATCH 19/41] maybe it works now --- cranelift/codegen/src/isa/x86/unwind.rs | 76 ++++++++++++++----- .../isa/x86/windows_fastcall_x64_unwind.clif | 35 +++++++++ 2 files changed, 94 insertions(+), 17 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index 8a95dabf74b9..94deb7e8a831 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -180,11 +180,21 @@ impl UnwindInfo { let mut unwind_codes = Vec::new(); let mut found_end = false; + // At this point we no longer have the stack slot that callee-save registers are placed + // into, so we must re-infer the offset from prologue instructions. + let mut min_callee_save_offset = None; + // In addition to the min offset for a callee-save, we need to know the offset from the + // frame base to the stack pointer, so that we can record an unwind offset that spans only + // to the end of callee-save space. + let mut static_frame_allocation_size = 0u32; + // For the time being, FPR preservation is split into a stack_addr and later store/load. // Store the register used for stack store and ensure it is the same register with no // intervening changes to the frame size. let mut frame_start_reg = None; + let mut preserved_fprs = Vec::new(); + for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) { // x64 ABI prologues cannot exceed 255 bytes in length if (offset + size) > 255 { @@ -195,11 +205,13 @@ impl UnwindInfo { let unwind_offset = (offset + size) as u8; + println!("offset {}: {:?}", offset, func.dfg[inst]); + match func.dfg[inst] { InstructionData::Unary { opcode, arg } => { match opcode { Opcode::X86Push => { - frame_offset += 8; + static_frame_allocation_size += 8; unwind_codes.push(UnwindCode::PushRegister { offset: unwind_offset, @@ -221,6 +233,10 @@ impl UnwindInfo { InstructionData::CopySpecial { src, dst, .. } => { if let Some(frame_register) = frame_register { if src == (RU::rsp as RegUnit) && dst == frame_register { + // Constructing an rbp-based stack frame, so the static frame + // allocation restarts at 0 from here. + static_frame_allocation_size = 0; + unwind_codes.push(UnwindCode::SetFramePointer { offset: unwind_offset, sp_offset: 0, @@ -245,7 +261,7 @@ impl UnwindInfo { let imm: i64 = imm.into(); assert!(imm <= core::u32::MAX as i64); - frame_offset += imm as u32; + static_frame_allocation_size += imm as u32; unwind_codes.push(UnwindCode::StackAlloc { offset: unwind_offset, @@ -255,26 +271,34 @@ impl UnwindInfo { _ => {} } } + InstructionData::StackLoad { + opcode: Opcode::StackAddr, + stack_slot: _, + offset: _, + } => { + let result = func.dfg.inst_results(inst).get(0).unwrap(); + if let ValueLoc::Reg(frame_reg) = func.locations[*result] { + frame_start_reg = Some(frame_reg); + } + }, InstructionData::Store { opcode: Opcode::Store, args: [arg1, arg2], flags: _flags, offset, } => { - if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru) = (func.locations[arg1], func.locations[arg2]) { - if base_ru != frame_start_reg { - continue; - } - let offset_int: i32 = offset.into(); - assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error"); - assert!(offset_int % 16 == 0, "xmm preservation offset must be 16-byte aligned, but was {:#x}", offset_int); - let scaled_offset = offset_int / 16; - if FPR.contains(ru) { - unwind_codes.push(UnwindCode::SaveXmm { - offset: unwind_offset, - reg: ru, - stack_offset: scaled_offset, - }); + if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) = (func.locations[arg1], func.locations[arg2]) { + if Some(base_ru) == frame_start_reg { + let offset_int: i32 = offset.into(); + assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error"); + if FPR.contains(ru) { + if let Some(prev_min) = min_callee_save_offset { + min_callee_save_offset = Some(std::cmp::min(prev_min, offset_int as u32)); + } else { + min_callee_save_offset = Some(offset_int as u32); + } + preserved_fprs.push((unwind_offset, ru, offset_int as u32)); + } } } } @@ -291,11 +315,29 @@ impl UnwindInfo { return None; } + println!("HEY THE OFFSET IS {:?}", min_callee_save_offset); + println!("static frame allocation size: {:#x}", static_frame_allocation_size); + if let Some(min_frame_offset) = min_callee_save_offset { + // So we've found the min callee-save offset, and we've found the size of the static + // allocation. Pick a frame register offset so the callee-save region starts at zero + // (eg, is the min offset), then correct the actual offsets to be with respect to the + // selected frame offset. + for (unwind_offset, ru, reg_offset) in preserved_fprs.into_iter() { + unwind_codes.push(UnwindCode::SaveXmm { + offset: unwind_offset, + reg: ru, + stack_offset: reg_offset - min_frame_offset, + }); + } + } else { + assert!(preserved_fprs.len() == 0, "if there is no recorded offset for FPR preservation, there must be no FPRs to preserve"); + } + Some(Self { flags: 0, // this assumes cranelift functions have no SEH handlers prologue_size: prologue_size as u8, frame_register, - frame_register_offset: 0, + frame_register_offset: (min_callee_save_offset.map(|callee_save| static_frame_allocation_size - callee_save).unwrap_or(0) / 16) as u8, unwind_codes, }) } diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif index ee15b2f96599..abdad395bfb8 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif @@ -118,6 +118,41 @@ block0: ; nextln: ], ; nextln: } +function %fpr_with_function_call(i64, i64) windows_fastcall { + fn0 = %foo(f64, f64, i64, i64, i64) windows_fastcall; +block0(v0: i64, v1: i64): + v2 = load.f64 v0+0 + v3 = load.f64 v0+8 + v4 = load.i64 v0+16 + v15 = load.f64 v0+104 + v16 = load.f64 v0+112 + v17 = load.f64 v0+120 + v18 = load.f64 v0+128 + v19 = load.f64 v0+136 + v20 = load.f64 v0+144 + v21 = load.f64 v0+152 + v22 = load.f64 v0+160 + v23 = load.f64 v0+168 + call fn0(v2, v3, v4, v1, v1) + store.f64 v15, v1+104 + store.f64 v16, v1+112 + store.f64 v17, v1+120 + store.f64 v18, v1+128 + store.f64 v19, v1+136 + store.f64 v20, v1+144 + store.f64 v21, v1+152 + store.f64 v22, v1+160 + store.f64 v23, v1+168 + return +} +; sameln: UnwindInfo { +; nextln: version: 1, +; nextln: flags: 0, +; nextln: prologue_size: 17, +; nextln: unwind_code_count_raw: 5, +; nextln: frame_register: 5, +; nextln: frame_register_offset: 0, + ; check a function that has CSRs function %lots_of_registers(i64, i64) windows_fastcall { block0(v0: i64, v1: i64): From 403ebb2ceb321598b0de7000199c4c0011d69db0 Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 10 Mar 2020 03:18:58 -0700 Subject: [PATCH 20/41] maaaybe get Windows FPR unwind info right? --- cranelift/codegen/src/isa/x86/unwind.rs | 92 +++++++++++-------- .../isa/x86/windows_fastcall_x64.clif | 26 ++++++ .../isa/x86/windows_fastcall_x64_unwind.clif | 23 +++-- 3 files changed, 93 insertions(+), 48 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index 94deb7e8a831..628a5903a020 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -80,19 +80,20 @@ impl UnwindCode { stack_offset, } => { write_u8(sink, *offset); - if *stack_offset <= core::u16::MAX as u32 { + let stack_offset = stack_offset / 16; + if stack_offset <= core::u16::MAX as u32 { write_u8( sink, (FPR.index_of(*reg) << 4) as u8 | (UnwindOperation::SaveXmm128 as u8), ); - write_u16::(sink, *stack_offset as u16); + write_u16::(sink, stack_offset as u16); } else { write_u8( sink, (FPR.index_of(*reg) << 4) as u8 | (UnwindOperation::SaveXmm128Far as u8), ); - write_u16::(sink, *stack_offset as u16); - write_u16::(sink, (*stack_offset >> 16) as u16); + write_u16::(sink, stack_offset as u16); + write_u16::(sink, (stack_offset >> 16) as u16); } } Self::StackAlloc { offset, size } => { @@ -180,9 +181,9 @@ impl UnwindInfo { let mut unwind_codes = Vec::new(); let mut found_end = false; - // At this point we no longer have the stack slot that callee-save registers are placed - // into, so we must re-infer the offset from prologue instructions. - let mut min_callee_save_offset = None; + // Have we saved at least one FPR? if so, we might have to check additional constraints. + let mut saved_fpr = false; + // In addition to the min offset for a callee-save, we need to know the offset from the // frame base to the stack pointer, so that we can record an unwind offset that spans only // to the end of callee-save space. @@ -191,9 +192,10 @@ impl UnwindInfo { // For the time being, FPR preservation is split into a stack_addr and later store/load. // Store the register used for stack store and ensure it is the same register with no // intervening changes to the frame size. - let mut frame_start_reg = None; - - let mut preserved_fprs = Vec::new(); + let mut callee_save_region_reg = None; + // Also record the callee-save region's offset from RSP, because it must be added to FPR + // save offsets to compute an offset from the frame base. + let mut callee_save_offset = None; for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) { // x64 ABI prologues cannot exceed 255 bytes in length @@ -205,8 +207,6 @@ impl UnwindInfo { let unwind_offset = (offset + size) as u8; - println!("offset {}: {:?}", offset, func.dfg[inst]); - match func.dfg[inst] { InstructionData::Unary { opcode, arg } => { match opcode { @@ -273,31 +273,43 @@ impl UnwindInfo { } InstructionData::StackLoad { opcode: Opcode::StackAddr, - stack_slot: _, + stack_slot, offset: _, } => { let result = func.dfg.inst_results(inst).get(0).unwrap(); if let ValueLoc::Reg(frame_reg) = func.locations[*result] { - frame_start_reg = Some(frame_reg); + callee_save_region_reg = Some(frame_reg); + + // Figure out the offset in the call frame that `frame_reg` will have. + let frame_size = func.stack_slots.layout_info.expect("func's stack slots have layout info if stack operations exist").frame_size; + // Because we're well after the prologue has been constructed, stack slots + // must have been laid out... + let slot_offset = func.stack_slots[stack_slot].offset.expect("callee-save slot has an offset computed"); + let frame_offset = frame_size as i32 + slot_offset; + + callee_save_offset = Some(frame_offset as u32); } - }, + } InstructionData::Store { opcode: Opcode::Store, args: [arg1, arg2], flags: _flags, offset, } => { - if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) = (func.locations[arg1], func.locations[arg2]) { - if Some(base_ru) == frame_start_reg { + if let (ValueLoc::Reg(ru), ValueLoc::Reg(base_ru)) = + (func.locations[arg1], func.locations[arg2]) + { + if Some(base_ru) == callee_save_region_reg { let offset_int: i32 = offset.into(); assert!(offset_int >= 0, "negative fpr offset would store outside the stack frame, and is almost certainly an error"); + let offset_int: u32 = offset_int as u32 + callee_save_offset.expect("FPR presevation requires an FPR save region, which has some stack offset"); if FPR.contains(ru) { - if let Some(prev_min) = min_callee_save_offset { - min_callee_save_offset = Some(std::cmp::min(prev_min, offset_int as u32)); - } else { - min_callee_save_offset = Some(offset_int as u32); - } - preserved_fprs.push((unwind_offset, ru, offset_int as u32)); + saved_fpr = true; + unwind_codes.push(UnwindCode::SaveXmm { + offset: unwind_offset, + reg: ru, + stack_offset: offset_int, + }); } } } @@ -315,29 +327,29 @@ impl UnwindInfo { return None; } - println!("HEY THE OFFSET IS {:?}", min_callee_save_offset); - println!("static frame allocation size: {:#x}", static_frame_allocation_size); - if let Some(min_frame_offset) = min_callee_save_offset { - // So we've found the min callee-save offset, and we've found the size of the static - // allocation. Pick a frame register offset so the callee-save region starts at zero - // (eg, is the min offset), then correct the actual offsets to be with respect to the - // selected frame offset. - for (unwind_offset, ru, reg_offset) in preserved_fprs.into_iter() { - unwind_codes.push(UnwindCode::SaveXmm { - offset: unwind_offset, - reg: ru, - stack_offset: reg_offset - min_frame_offset, - }); - } - } else { - assert!(preserved_fprs.len() == 0, "if there is no recorded offset for FPR preservation, there must be no FPRs to preserve"); + if static_frame_allocation_size > 240 && saved_fpr { + panic!("stack frame is too large to use with Windows x64 SEH when preserving FPRs"); } + assert!(static_frame_allocation_size % 16 == 0, "static frame allocation must be a multiple of 16"); + + // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at + // one end of the call frame, and RSP at the other, required offsets are arbitrarily large. + // Windows x64 SEH only allows this offset be up to 240 bytes, however, meaning large + // frames are inexpressible, and we cannot actually compile the function. In case there are + // no preserved FPRs, we can lie without error and claim the offset to RBP is 0 - nothing + // will actually check it. This, then, avoids panics when compiling functions with large + // call frames. + let reported_frame_offset = if saved_fpr { + (static_frame_allocation_size / 16) as u8 + } else { + 0 + }; Some(Self { flags: 0, // this assumes cranelift functions have no SEH handlers prologue_size: prologue_size as u8, frame_register, - frame_register_offset: (min_callee_save_offset.map(|callee_save| static_frame_allocation_size - callee_save).unwrap_or(0) / 16) as u8, + frame_register_offset: reported_frame_offset, unwind_codes, }) } diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 6ea7140fd560..064d0250a4b8 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -68,3 +68,29 @@ block0(v0: f32, v1: f64, v2: i64, v3: i64): return v1 } ; check: function %ret_val_float(f32 [%xmm0], f64 [%xmm1], i64 [%r8], i64 [%r9], i64 fp [%rbp]) -> f64 [%xmm0], i64 fp [%rbp] windows_fastcall { + +function %internal_stack_arg_function_call(i64) -> i64 windows_fastcall { + fn0 = %foo(i64, i64, i64, i64) -> i64 + fn1 = %foo2(i64, i64, i64, i64) -> i64 +block0(v0: i64): + v1 = load.i64 v0+0 + v2 = load.i64 v0+8 + v3 = load.i64 v0+16 + v4 = load.i64 v0+24 + v5 = load.i64 v0+32 + v6 = load.i64 v0+40 + v7 = load.i64 v0+48 + v8 = load.i64 v0+56 + v9 = load.i64 v0+64 + v10 = call fn0(v1, v2, v3, v4) + store.i64 v1, v0+8 + store.i64 v2, v0+16 + store.i64 v3, v0+24 + store.i64 v4, v0+32 + store.i64 v5, v0+40 + store.i64 v6, v0+48 + store.i64 v7, v0+56 + store.i64 v8, v0+64 + store.i64 v9, v0+72 + return v10 +} diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif index abdad395bfb8..601c83e394a9 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif @@ -148,10 +148,17 @@ block0(v0: i64, v1: i64): ; sameln: UnwindInfo { ; nextln: version: 1, ; nextln: flags: 0, -; nextln: prologue_size: 17, -; nextln: unwind_code_count_raw: 5, +; nextln: prologue_size: 26, +; nextln: unwind_code_count_raw: 7, ; nextln: frame_register: 5, -; nextln: frame_register_offset: 0, +; nextln: frame_register_offset: 12, +; nextln: unwind_codes: [ +; nextln: UnwindCode { +; nextln: offset: 26, +; nextln: op: SaveXmm128, +; nextln: info: 15, +; nextln: value: U16( +; nextln: 3, ; check a function that has CSRs function %lots_of_registers(i64, i64) windows_fastcall { @@ -205,13 +212,13 @@ block0(v0: i64, v1: i64): ; sameln: UnwindInfo { ; nextln: version: 1, ; nextln: flags: 0, -; nextln: prologue_size: 42, +; nextln: prologue_size: 44, ; nextln: unwind_code_count_raw: 16, ; nextln: frame_register: 5, -; nextln: frame_register_offset: 0, +; nextln: frame_register_offset: 10, ; nextln: unwind_codes: [ ; nextln: UnwindCode { -; nextln: offset: 42, +; nextln: offset: 44, ; nextln: op: SaveXmm128, ; nextln: info: 8, ; nextln: value: U16( @@ -219,7 +226,7 @@ block0(v0: i64, v1: i64): ; nextln: ), ; nextln: }, ; nextln: UnwindCode { -; nextln: offset: 36, +; nextln: offset: 38, ; nextln: op: SaveXmm128, ; nextln: info: 7, ; nextln: value: U16( @@ -227,7 +234,7 @@ block0(v0: i64, v1: i64): ; nextln: ), ; nextln: }, ; nextln: UnwindCode { -; nextln: offset: 31, +; nextln: offset: 32, ; nextln: op: SaveXmm128, ; nextln: info: 6, ; nextln: value: U16( From eeca3f442d1d15567f3a114f1ef655ccb9c4bcb9 Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 10 Mar 2020 13:22:20 -0700 Subject: [PATCH 21/41] running rustfmt in the wrong directory does nothing --- cranelift/codegen/src/isa/x86/unwind.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index 628a5903a020..e975934a7f53 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -281,10 +281,16 @@ impl UnwindInfo { callee_save_region_reg = Some(frame_reg); // Figure out the offset in the call frame that `frame_reg` will have. - let frame_size = func.stack_slots.layout_info.expect("func's stack slots have layout info if stack operations exist").frame_size; + let frame_size = func + .stack_slots + .layout_info + .expect("func's stack slots have layout info if stack operations exist") + .frame_size; // Because we're well after the prologue has been constructed, stack slots // must have been laid out... - let slot_offset = func.stack_slots[stack_slot].offset.expect("callee-save slot has an offset computed"); + let slot_offset = func.stack_slots[stack_slot] + .offset + .expect("callee-save slot has an offset computed"); let frame_offset = frame_size as i32 + slot_offset; callee_save_offset = Some(frame_offset as u32); @@ -330,7 +336,10 @@ impl UnwindInfo { if static_frame_allocation_size > 240 && saved_fpr { panic!("stack frame is too large to use with Windows x64 SEH when preserving FPRs"); } - assert!(static_frame_allocation_size % 16 == 0, "static frame allocation must be a multiple of 16"); + assert!( + static_frame_allocation_size % 16 == 0, + "static frame allocation must be a multiple of 16" + ); // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at // one end of the call frame, and RSP at the other, required offsets are arbitrarily large. From b86234b0ac8599b24c97eb353e5c0935421b9b0a Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 10 Mar 2020 14:56:01 -0700 Subject: [PATCH 22/41] ci speak to me of windows woes --- cranelift/codegen/src/isa/x86/unwind.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index e975934a7f53..eb454873ce27 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -207,6 +207,8 @@ impl UnwindInfo { let unwind_offset = (offset + size) as u8; + println!("inst offset {}: {}", offset, func.dfg.display_inst(inst, isa)); + match func.dfg[inst] { InstructionData::Unary { opcode, arg } => { match opcode { @@ -336,10 +338,15 @@ impl UnwindInfo { if static_frame_allocation_size > 240 && saved_fpr { panic!("stack frame is too large to use with Windows x64 SEH when preserving FPRs"); } - assert!( - static_frame_allocation_size % 16 == 0, - "static frame allocation must be a multiple of 16" - ); + if static_frame_allocation_size % 16 != 0 { + eprintln!("static frame allocation size: {}", static_frame_allocation_size); + panic!("bad frame size"); + } else { + assert!( + static_frame_allocation_size % 16 == 0, + "static frame allocation must be a multiple of 16" + ); + } // Hack to avoid panicking unnecessarily. Because Cranelift generates prologues with RBP at // one end of the call frame, and RSP at the other, required offsets are arbitrarily large. From eeea0dd5323adb41d83f5f892f5c1af183ab6200 Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 11 Mar 2020 00:56:32 -0700 Subject: [PATCH 23/41] only check for aligned frame allocation when we need it to be aligned --- cranelift/codegen/src/isa/x86/unwind.rs | 27 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index eb454873ce27..0348af9680d2 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -207,7 +207,11 @@ impl UnwindInfo { let unwind_offset = (offset + size) as u8; - println!("inst offset {}: {}", offset, func.dfg.display_inst(inst, isa)); + println!( + "inst offset {}: {}", + offset, + func.dfg.display_inst(inst, isa) + ); match func.dfg[inst] { InstructionData::Unary { opcode, arg } => { @@ -221,12 +225,15 @@ impl UnwindInfo { }); } Opcode::AdjustSpDown => { + let stack_size = + stack_size.expect("expected a previous stack size instruction"); + static_frame_allocation_size += stack_size; + // This is used when calling a stack check function // We need to track the assignment to RAX which has the size of the stack unwind_codes.push(UnwindCode::StackAlloc { offset: unwind_offset, - size: stack_size - .expect("expected a previous stack size instruction"), + size: stack_size, }); } _ => {} @@ -335,13 +342,13 @@ impl UnwindInfo { return None; } - if static_frame_allocation_size > 240 && saved_fpr { - panic!("stack frame is too large to use with Windows x64 SEH when preserving FPRs"); - } - if static_frame_allocation_size % 16 != 0 { - eprintln!("static frame allocation size: {}", static_frame_allocation_size); - panic!("bad frame size"); - } else { + if saved_fpr { + if static_frame_allocation_size > 240 && saved_fpr { + panic!("stack frame is too large to use with Windows x64 SEH when preserving FPRs"); + } + // Only test static frame size is 16-byte aligned when an FPR is saved to avoid + // panicking when alignment is elided because no FPRs are saved and no child calls are + // made. assert!( static_frame_allocation_size % 16 == 0, "static frame allocation must be a multiple of 16" From 74d4ab832d5d53dc72a036f2a1b134169f40b9fd Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 11 Mar 2020 10:43:45 -0700 Subject: [PATCH 24/41] print the whole prologue that caused issues in unwind another temporary commit. Cranelift::spec::multi_value::func panics but this is not reproducible using clif-util targeting windows? --- cranelift/codegen/src/isa/x86/unwind.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index 0348af9680d2..c226bd7efe84 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -197,6 +197,8 @@ impl UnwindInfo { // save offsets to compute an offset from the frame base. let mut callee_save_offset = None; + let mut prologue_text = alloc::string::String::new(); + for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) { // x64 ABI prologues cannot exceed 255 bytes in length if (offset + size) > 255 { @@ -207,11 +209,11 @@ impl UnwindInfo { let unwind_offset = (offset + size) as u8; - println!( - "inst offset {}: {}", + prologue_text.push_str(&format!( + "offset {}: {} ; ", offset, func.dfg.display_inst(inst, isa) - ); + )); match func.dfg[inst] { InstructionData::Unary { opcode, arg } => { @@ -344,7 +346,7 @@ impl UnwindInfo { if saved_fpr { if static_frame_allocation_size > 240 && saved_fpr { - panic!("stack frame is too large to use with Windows x64 SEH when preserving FPRs"); + panic!("stack frame is too large to use with Windows x64 SEH when preserving FPRs. size is {}, prologue is {}", static_frame_allocation_size, prologue_text); } // Only test static frame size is 16-byte aligned when an FPR is saved to avoid // panicking when alignment is elided because no FPRs are saved and no child calls are From f0f2394ad39e3bd1a85256d43253650ed8980220 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 13 Mar 2020 14:55:40 -0700 Subject: [PATCH 25/41] add a comment explaining the seemingly-trunctaed test --- .../filetests/isa/x86/windows_fastcall_x64_unwind.clif | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif index 601c83e394a9..1119c550128f 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64_unwind.clif @@ -145,6 +145,11 @@ block0(v0: i64, v1: i64): store.f64 v23, v1+168 return } +; Only check the first unwind code here because this test specifically looks to +; see that in a function that is not a leaf, a callee-save FPR is stored in an +; area that does not overlap either the callee's shadow space or stack argument +; space. +; ; sameln: UnwindInfo { ; nextln: version: 1, ; nextln: flags: 0, From 7bcff831cdd71e63a80ed5d9aed6491d84ea7b63 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 13 Mar 2020 15:08:45 -0700 Subject: [PATCH 26/41] Revert "ci speak to me of windows woes" This reverts commit 29c6c40d24775e9775f77cd651b78bc3e71d4c3d. --- cranelift/codegen/src/isa/x86/unwind.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index c226bd7efe84..adc9230054f2 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -197,8 +197,6 @@ impl UnwindInfo { // save offsets to compute an offset from the frame base. let mut callee_save_offset = None; - let mut prologue_text = alloc::string::String::new(); - for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) { // x64 ABI prologues cannot exceed 255 bytes in length if (offset + size) > 255 { @@ -209,12 +207,6 @@ impl UnwindInfo { let unwind_offset = (offset + size) as u8; - prologue_text.push_str(&format!( - "offset {}: {} ; ", - offset, - func.dfg.display_inst(inst, isa) - )); - match func.dfg[inst] { InstructionData::Unary { opcode, arg } => { match opcode { @@ -346,7 +338,7 @@ impl UnwindInfo { if saved_fpr { if static_frame_allocation_size > 240 && saved_fpr { - panic!("stack frame is too large to use with Windows x64 SEH when preserving FPRs. size is {}, prologue is {}", static_frame_allocation_size, prologue_text); + panic!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs", static_frame_allocation_size); } // Only test static frame size is 16-byte aligned when an FPR is saved to avoid // panicking when alignment is elided because no FPRs are saved and no child calls are From 56e45cfb7636a8e6fafe53ee0d64e2c5fc5ea2e1 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 13 Mar 2020 15:18:34 -0700 Subject: [PATCH 27/41] preserve FPRs in an ExplicitSlot rather than SpillSlot ExplicitSlot is noted as intended for use with stack_load and stack_store, which are morally equivalent to FPR save/restore --- cranelift/codegen/src/isa/x86/abi.rs | 2 +- cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index ba390bca7da1..a2ce2f3a420f 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -573,7 +573,7 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // Only create an FPR stack slot if we're going to save FPRs. let fpr_slot = if num_fprs > 0 { Some(func.create_stack_slot(ir::StackSlotData { - kind: ir::StackSlotKind::SpillSlot, + kind: ir::StackSlotKind::ExplicitSlot, size: (num_fprs * types::F64X2.bytes() as usize) as u32, offset: None, })) diff --git a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif index 064d0250a4b8..917d179ccf74 100644 --- a/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif +++ b/cranelift/filetests/filetests/isa/x86/windows_fastcall_x64.clif @@ -41,7 +41,7 @@ block0(v0: f64, v1: f64, v2: f64, v3: f64): return } ; check: function %float_callee_sav(f64 [%xmm0], f64 [%xmm1], f64 [%xmm2], f64 [%xmm3], i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7]) -> i64 fp [%rbp], f64 csr [%xmm6], f64 csr [%xmm7] windows_fastcall { -; nextln: ss0 = spill_slot 32, offset -80 +; nextln: ss0 = explicit_slot 32, offset -80 ; nextln: ss1 = incoming_arg 16, offset -48 ; check: block0(v0: f64 [%xmm0], v1: f64 [%xmm1], v2: f64 [%xmm2], v3: f64 [%xmm3], v6: i64 [%rbp], v8: f64 [%xmm6], v9: f64 [%xmm7]): ; nextln: x86_push v6 From 84da1245e926b5c51f06e8e277d4f2de2e26318c Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 13 Mar 2020 15:29:53 -0700 Subject: [PATCH 28/41] remove helper function that only existed due to a bad rebase --- cranelift/codegen/src/regalloc/register_set.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cranelift/codegen/src/regalloc/register_set.rs b/cranelift/codegen/src/regalloc/register_set.rs index ff363f505a75..52b8a6fa0a10 100644 --- a/cranelift/codegen/src/regalloc/register_set.rs +++ b/cranelift/codegen/src/regalloc/register_set.rs @@ -46,11 +46,6 @@ impl RegisterSet { Self { avail: [0; 3] } } - /// Returns `true` if all registers in this set are available. - pub fn is_empty(&self) -> bool { - self.avail == [0; 3] - } - /// Returns `true` if the specified register is available. pub fn is_avail(&self, rc: RegClass, reg: RegUnit) -> bool { let (idx, bits) = bitmask(rc, reg); From ceb48bf99d3095f8fe4bc527f9c768823277c9d3 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 13 Mar 2020 16:56:44 -0700 Subject: [PATCH 29/41] temporarily disable the panic-reaching multi-return spectest --- build.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/build.rs b/build.rs index ea6e6c64071b..ff464e3f37f3 100644 --- a/build.rs +++ b/build.rs @@ -194,6 +194,14 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { ("reference_types", "table_copy_on_imported_tables") => return false, ("reference_types", _) => return true, + ("multi_value", "func") => { + // FIXME This involves a function with very large stack frame that Cranelift currently + // cannot compile using the fastcall (Windows) calling convention. + // See https://github.com/bytecodealliance/wasmtime/pull/1216. + #[cfg(windows)] + return true; + }, + _ => {} }, _ => panic!("unrecognized strategy"), From 291d09b2153f57d51888636f1ef21a53749ee3bb Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 13 Mar 2020 17:03:50 -0700 Subject: [PATCH 30/41] rust ... format ... --- build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.rs b/build.rs index ff464e3f37f3..3893e983f736 100644 --- a/build.rs +++ b/build.rs @@ -200,7 +200,7 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { // See https://github.com/bytecodealliance/wasmtime/pull/1216. #[cfg(windows)] return true; - }, + } _ => {} }, From 0a418a2c9d25bb38dc6416854710c2816e04c5ba Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 2 Apr 2020 12:01:49 -0700 Subject: [PATCH 31/41] ignore more windows-unfriendly tests? --- build.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index 3893e983f736..c6879cb4140c 100644 --- a/build.rs +++ b/build.rs @@ -194,8 +194,11 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { ("reference_types", "table_copy_on_imported_tables") => return false, ("reference_types", _) => return true, + ("misc", "export_large_signature") | + ("spec", "call") | + ("multi_value", "call") | ("multi_value", "func") => { - // FIXME This involves a function with very large stack frame that Cranelift currently + // FIXME These involves functions with very large stack frames that Cranelift currently // cannot compile using the fastcall (Windows) calling convention. // See https://github.com/bytecodealliance/wasmtime/pull/1216. #[cfg(windows)] From a7e23a9ad400edaa542b6c26d8dbc16dbeddef66 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 2 Apr 2020 13:01:25 -0700 Subject: [PATCH 32/41] try different names for the testsuites? --- build.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.rs b/build.rs index c6879cb4140c..0f0c7de12bcf 100644 --- a/build.rs +++ b/build.rs @@ -194,8 +194,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { ("reference_types", "table_copy_on_imported_tables") => return false, ("reference_types", _) => return true, - ("misc", "export_large_signature") | - ("spec", "call") | + ("misc_testsuite", "export_large_signature") | + ("spec_testsuite", "call") | ("multi_value", "call") | ("multi_value", "func") => { // FIXME These involves functions with very large stack frames that Cranelift currently From 52425ef9ea390ffaddb86432dc8ba79e21221edf Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 2 Apr 2020 13:01:38 -0700 Subject: [PATCH 33/41] rustfmt --- build.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build.rs b/build.rs index 0f0c7de12bcf..3eeb27c1796f 100644 --- a/build.rs +++ b/build.rs @@ -194,10 +194,10 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { ("reference_types", "table_copy_on_imported_tables") => return false, ("reference_types", _) => return true, - ("misc_testsuite", "export_large_signature") | - ("spec_testsuite", "call") | - ("multi_value", "call") | - ("multi_value", "func") => { + ("misc_testsuite", "export_large_signature") + | ("spec_testsuite", "call") + | ("multi_value", "call") + | ("multi_value", "func") => { // FIXME These involves functions with very large stack frames that Cranelift currently // cannot compile using the fastcall (Windows) calling convention. // See https://github.com/bytecodealliance/wasmtime/pull/1216. From 3060c8cff1c347e2a51aa0a850c28fc20c6ab703 Mon Sep 17 00:00:00 2001 From: iximeow Date: Mon, 6 Apr 2020 16:43:55 -0700 Subject: [PATCH 34/41] update panic message with reference to implementation limit --- cranelift/codegen/src/isa/x86/unwind.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index adc9230054f2..8574ece7d3c5 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -338,7 +338,10 @@ impl UnwindInfo { if saved_fpr { if static_frame_allocation_size > 240 && saved_fpr { - panic!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs", static_frame_allocation_size); + panic!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \ + This is a Cranelift implementation limit, see \ + https://github.com/bytecodealliance/wasmtime/issues/1475", + static_frame_allocation_size); } // Only test static frame size is 16-byte aligned when an FPR is saved to avoid // panicking when alignment is elided because no FPRs are saved and no child calls are From 42d129b5517a28ed5f2c92f0c45864c6c66e6ec4 Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 7 Apr 2020 03:06:26 -0700 Subject: [PATCH 35/41] review feedback --- cranelift/codegen/src/isa/x86/abi.rs | 36 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index a2ce2f3a420f..7c61bf40e25d 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -407,7 +407,7 @@ fn callee_saved_fprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] // "registers RBX, ... , and XMM6-15 are considered nonvolatile and must be saved // and restored by a function that uses them." // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention as of - // 02-05-2020. + // February 2nd, 2020. &[ RU::xmm6, RU::xmm7, @@ -572,6 +572,12 @@ fn fastcall_prologue_epilogue(func: &mut ir::Function, isa: &dyn TargetIsa) -> C // Only create an FPR stack slot if we're going to save FPRs. let fpr_slot = if num_fprs > 0 { + // Create a stack slot for FPRs to be preserved in. This is an `ExplicitSlot` because it + // seems to most closely map to it as a `StackSlotKind`: FPR preserve/restore should be + // through `stack_load` and `stack_store` (see later comment about issue #1198). Even + // though in a certain light FPR preserve/restore is "spilling" an argument, regalloc + // implies that `SpillSlot` may be eligible for certain optimizations, and we know with + // certainty that this space may not be reused in the function, nor moved around. Some(func.create_stack_slot(ir::StackSlotData { kind: ir::StackSlotKind::ExplicitSlot, size: (num_fprs * types::F64X2.bytes() as usize) as u32, @@ -886,21 +892,21 @@ fn insert_common_prologue( } // Now that RSP is prepared for the function, we can use stack slots: - if csrs.iter(FPR).len() != 0 { - let mut fpr_offset = 0; + if let Some(fpr_slot) = fpr_slot { + debug_assert!(csrs.iter(FPR).len() != 0); // `stack_store` is not directly encodable in x86_64 at the moment, so we'll need a base // address. We are well after postopt could run, so load the CSR region base once here, // instead of hoping that the addr/store will be combined later. // See also: https://github.com/bytecodealliance/wasmtime/pull/1198 - let stack_addr = pos.ins().stack_addr( - types::I64, - *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), - 0, - ); + let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0); + // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at + // function entry. pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16); + let mut fpr_offset = 0; + for reg in csrs.iter(FPR) { // Append param to entry Block let csr_arg = pos.func.dfg.append_block_param(block, types::F64); @@ -1011,22 +1017,22 @@ fn insert_common_epilogue( let mut restored_fpr_values = alloc::vec::Vec::new(); // Restore FPRs before we move RSP and invalidate stack slots. - if csrs.iter(FPR).len() != 0 { - let mut fpr_offset = 0; + if let Some(fpr_slot) = fpr_slot { + debug_assert!(csrs.iter(FPR).len() != 0); // `stack_load` is not directly encodable in x86_64 at the moment, so we'll need a base // address. We are well after postopt could run, so load the CSR region base once here, // instead of hoping that the addr/store will be combined later. // // See also: https://github.com/bytecodealliance/wasmtime/pull/1198 - let stack_addr = pos.ins().stack_addr( - types::I64, - *fpr_slot.expect("if FPRs are preserved, a stack slot is allocated for them"), - 0, - ); + let stack_addr = pos.ins().stack_addr(types::I64, *fpr_slot, 0); + // Use r11 as fastcall allows it to be clobbered, and it won't have a meaningful value at + // function exit. pos.func.locations[stack_addr] = ir::ValueLoc::Reg(RU::r11 as u16); + let mut fpr_offset = 0; + for reg in csrs.iter(FPR) { let value = pos .ins() From c7547ed67bf2be3ab5833903aeca0a30092c3c2b Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 7 Apr 2020 04:02:43 -0700 Subject: [PATCH 36/41] fix date --- cranelift/codegen/src/isa/x86/abi.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index 7c61bf40e25d..bef7584a9370 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -407,7 +407,7 @@ fn callee_saved_fprs(isa: &dyn TargetIsa, call_conv: CallConv) -> &'static [RU] // "registers RBX, ... , and XMM6-15 are considered nonvolatile and must be saved // and restored by a function that uses them." // as per https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention as of - // February 2nd, 2020. + // February 5th, 2020. &[ RU::xmm6, RU::xmm7, From c3530e6170380ab2fdc4dfcc1f9c93a44ffdfbd6 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 9 Apr 2020 12:46:14 -0700 Subject: [PATCH 37/41] try making unwind info fallible --- cranelift/codegen/src/context.rs | 4 ++-- cranelift/codegen/src/isa/mod.rs | 3 ++- cranelift/codegen/src/isa/x86/abi.rs | 9 ++++++--- cranelift/codegen/src/isa/x86/mod.rs | 4 ++-- cranelift/codegen/src/isa/x86/unwind.rs | 23 +++++++++++++++-------- cranelift/filetests/src/test_fde.rs | 2 +- cranelift/filetests/src/test_unwind.rs | 2 +- crates/environ/src/compilation.rs | 2 +- 8 files changed, 30 insertions(+), 19 deletions(-) diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs index ca70293c05fb..967d302ecb9f 100644 --- a/cranelift/codegen/src/context.rs +++ b/cranelift/codegen/src/context.rs @@ -206,8 +206,8 @@ impl Context { isa: &dyn TargetIsa, kind: FrameUnwindKind, sink: &mut dyn FrameUnwindSink, - ) { - isa.emit_unwind_info(&self.func, kind, sink); + ) -> CodegenResult<()> { + isa.emit_unwind_info(&self.func, kind, sink) } /// Run the verifier on the function. diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 9c91d4219390..643b98b5e208 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -398,7 +398,8 @@ pub trait TargetIsa: fmt::Display + Send + Sync { _func: &ir::Function, _kind: binemit::FrameUnwindKind, _sink: &mut dyn binemit::FrameUnwindSink, - ) { + ) -> CodegenResult<()> { + Ok(()) // No-op by default } } diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index bef7584a9370..3d21f3344d47 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -21,6 +21,7 @@ use crate::ir::{ FrameLayoutChange, InstBuilder, ValueLoc, }; use crate::isa::{CallConv, RegClass, RegUnit, TargetIsa}; +use alloc::vec::Vec; use crate::regalloc::RegisterSet; use crate::result::CodegenResult; use crate::stack_layout::layout_stack; @@ -1014,7 +1015,7 @@ fn insert_common_epilogue( // Even though instructions to restore FPRs are inserted first, we have to append them after // restored GPRs to satisfy parameter order in the return. - let mut restored_fpr_values = alloc::vec::Vec::new(); + let mut restored_fpr_values = Vec::new(); // Restore FPRs before we move RSP and invalidate stack slots. if let Some(fpr_slot) = fpr_slot { @@ -1149,12 +1150,12 @@ pub fn emit_unwind_info( isa: &dyn TargetIsa, kind: FrameUnwindKind, sink: &mut dyn FrameUnwindSink, -) { +) -> CodegenResult<()> { match kind { FrameUnwindKind::Fastcall => { // Assumption: RBP is being used as the frame pointer // In the future, Windows fastcall codegen should usually omit the frame pointer - if let Some(info) = UnwindInfo::try_from_func(func, isa, Some(RU::rbp.into())) { + if let Some(info) = UnwindInfo::try_from_func(func, isa, Some(RU::rbp.into()))? { info.emit(sink); } } @@ -1164,4 +1165,6 @@ pub fn emit_unwind_info( } } } + + Ok(()) } diff --git a/cranelift/codegen/src/isa/x86/mod.rs b/cranelift/codegen/src/isa/x86/mod.rs index 042874ea69e5..881f12bdb1b4 100644 --- a/cranelift/codegen/src/isa/x86/mod.rs +++ b/cranelift/codegen/src/isa/x86/mod.rs @@ -177,8 +177,8 @@ impl TargetIsa for Isa { func: &ir::Function, kind: FrameUnwindKind, sink: &mut dyn FrameUnwindSink, - ) { - abi::emit_unwind_info(func, self, kind, sink); + ) -> CodegenResult<()> { + abi::emit_unwind_info(func, self, kind, sink) } } diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index 8574ece7d3c5..abd405598cab 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -4,8 +4,10 @@ use super::registers::{FPR, GPR, RU}; use crate::binemit::FrameUnwindSink; use crate::ir::{Function, InstructionData, Opcode, ValueLoc}; use crate::isa::{CallConv, RegUnit, TargetIsa}; +use crate::result::{CodegenError, CodegenResult}; use alloc::vec::Vec; use byteorder::{ByteOrder, LittleEndian}; +use log::warn; /// Maximum (inclusive) size of a "small" stack allocation const SMALL_ALLOC_MAX_SIZE: u32 = 128; @@ -166,10 +168,10 @@ impl UnwindInfo { func: &Function, isa: &dyn TargetIsa, frame_register: Option, - ) -> Option { + ) -> CodegenResult> { // Only Windows fastcall is supported for unwind information if func.signature.call_conv != CallConv::WindowsFastcall || func.prologue_end.is_none() { - return None; + return Ok(None); } let prologue_end = func.prologue_end.unwrap(); @@ -200,7 +202,8 @@ impl UnwindInfo { for (offset, inst, size) in func.inst_offsets(entry_block, &isa.encoding_info()) { // x64 ABI prologues cannot exceed 255 bytes in length if (offset + size) > 255 { - panic!("function prologues cannot exceed 255 bytes in size for Windows x64"); + warn!("function prologues cannot exceed 255 bytes in size for Windows x64"); + return Err(CodegenError::CodeTooLarge); } prologue_size += size; @@ -333,15 +336,16 @@ impl UnwindInfo { } if !found_end { - return None; + return Ok(None); } if saved_fpr { if static_frame_allocation_size > 240 && saved_fpr { - panic!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \ + warn!("stack frame is too large ({} bytes) to use with Windows x64 SEH when preserving FPRs. \ This is a Cranelift implementation limit, see \ https://github.com/bytecodealliance/wasmtime/issues/1475", static_frame_allocation_size); + return Err(CodegenError::ImplLimitExceeded); } // Only test static frame size is 16-byte aligned when an FPR is saved to avoid // panicking when alignment is elided because no FPRs are saved and no child calls are @@ -365,13 +369,13 @@ impl UnwindInfo { 0 }; - Some(Self { + Ok(Some(Self { flags: 0, // this assumes cranelift functions have no SEH handlers prologue_size: prologue_size as u8, frame_register, frame_register_offset: reported_frame_offset, unwind_codes, - }) + })) } pub fn size(&self) -> usize { @@ -470,7 +474,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); - assert_eq!(UnwindInfo::try_from_func(&context.func, &*isa, None), None); + assert_eq!(UnwindInfo::try_from_func(&context.func, &*isa, None).expect("can emit unwind info"), None); } #[test] @@ -487,6 +491,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let unwind = UnwindInfo::try_from_func(&context.func, &*isa, Some(RU::rbp.into())) + .expect("can emit unwind info") .expect("expected unwind info"); assert_eq!( @@ -551,6 +556,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let unwind = UnwindInfo::try_from_func(&context.func, &*isa, Some(RU::rbp.into())) + .expect("can emit unwind info") .expect("expected unwind info"); assert_eq!( @@ -615,6 +621,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let unwind = UnwindInfo::try_from_func(&context.func, &*isa, Some(RU::rbp.into())) + .expect("can emit unwind info") .expect("expected unwind info"); assert_eq!( diff --git a/cranelift/filetests/src/test_fde.rs b/cranelift/filetests/src/test_fde.rs index 3e3747fdde2c..8625f44890b7 100644 --- a/cranelift/filetests/src/test_fde.rs +++ b/cranelift/filetests/src/test_fde.rs @@ -64,7 +64,7 @@ impl SubTest for TestUnwind { } let mut sink = SimpleUnwindSink(Vec::new(), 0, Vec::new()); - comp_ctx.emit_unwind_info(isa, FrameUnwindKind::Libunwind, &mut sink); + comp_ctx.emit_unwind_info(isa, FrameUnwindKind::Libunwind, &mut sink).expect("can emit unwind info"); let mut text = String::new(); if sink.0.is_empty() { diff --git a/cranelift/filetests/src/test_unwind.rs b/cranelift/filetests/src/test_unwind.rs index e8b2667fc6e9..2ddd3f0555ff 100644 --- a/cranelift/filetests/src/test_unwind.rs +++ b/cranelift/filetests/src/test_unwind.rs @@ -59,7 +59,7 @@ impl SubTest for TestUnwind { } let mut sink = Sink(Vec::new()); - comp_ctx.emit_unwind_info(isa, FrameUnwindKind::Fastcall, &mut sink); + comp_ctx.emit_unwind_info(isa, FrameUnwindKind::Fastcall, &mut sink).expect("can emit unwind info"); let mut text = String::new(); if sink.0.is_empty() { diff --git a/crates/environ/src/compilation.rs b/crates/environ/src/compilation.rs index 8c7bde36215d..dfd4badf95c7 100644 --- a/crates/environ/src/compilation.rs +++ b/crates/environ/src/compilation.rs @@ -80,7 +80,7 @@ impl CompiledFunctionUnwindInfo { }; let mut sink = Sink(Vec::new(), 0, Vec::new()); - context.emit_unwind_info(isa, kind, &mut sink); + context.emit_unwind_info(isa, kind, &mut sink).expect("can emit unwind info");; let Sink(data, offset, relocs) = sink; if data.is_empty() { From f403c0a57c58ba6abb16ce16e2a61101db8686c2 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 9 Apr 2020 13:25:27 -0700 Subject: [PATCH 38/41] pass errors in generating unwind info back up through CompiledFunctionUnwindInfo::new --- crates/api/src/trampoline/func.rs | 4 +++- crates/environ/src/compilation.rs | 16 +++++++++------- crates/environ/src/cranelift.rs | 5 ++++- crates/jit/src/compiler.rs | 9 ++++++++- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/crates/api/src/trampoline/func.rs b/crates/api/src/trampoline/func.rs index 4d64482deacc..fbfa0d3e811a 100644 --- a/crates/api/src/trampoline/func.rs +++ b/crates/api/src/trampoline/func.rs @@ -188,7 +188,9 @@ fn make_trampoline( .map_err(|error| pretty_error(&context.func, Some(isa), error)) .expect("compile_and_emit"); - let unwind_info = CompiledFunctionUnwindInfo::new(isa, &context); + let unwind_info = CompiledFunctionUnwindInfo::new(isa, &context) + .map_err(|error| pretty_error(&context.func, Some(isa), error)) + .expect("emit unwind info"); code_memory .allocate_for_function(&CompiledFunction { diff --git a/crates/environ/src/compilation.rs b/crates/environ/src/compilation.rs index dfd4badf95c7..e9bdbcc7b380 100644 --- a/crates/environ/src/compilation.rs +++ b/crates/environ/src/compilation.rs @@ -4,7 +4,7 @@ use crate::cache::ModuleCacheDataTupleType; use crate::CacheConfig; use crate::ModuleTranslation; -use cranelift_codegen::{binemit, ir, isa, Context}; +use cranelift_codegen::{binemit, ir, isa, Context, CodegenResult}; use cranelift_entity::PrimaryMap; use cranelift_wasm::{DefinedFuncIndex, FuncIndex, WasmError}; use serde::{Deserialize, Serialize}; @@ -36,7 +36,7 @@ pub enum CompiledFunctionUnwindInfo { impl CompiledFunctionUnwindInfo { /// Constructs unwind info object. - pub fn new(isa: &dyn isa::TargetIsa, context: &Context) -> Self { + pub fn new(isa: &dyn isa::TargetIsa, context: &Context) -> CodegenResult { use cranelift_codegen::binemit::{ FrameUnwindKind, FrameUnwindOffset, FrameUnwindSink, Reloc, }; @@ -75,24 +75,26 @@ impl CompiledFunctionUnwindInfo { CallConv::SystemV | CallConv::Fast | CallConv::Cold => FrameUnwindKind::Libunwind, CallConv::WindowsFastcall => FrameUnwindKind::Fastcall, _ => { - return CompiledFunctionUnwindInfo::None; + return Ok(CompiledFunctionUnwindInfo::None); } }; let mut sink = Sink(Vec::new(), 0, Vec::new()); - context.emit_unwind_info(isa, kind, &mut sink).expect("can emit unwind info");; + context.emit_unwind_info(isa, kind, &mut sink)?; let Sink(data, offset, relocs) = sink; if data.is_empty() { - return CompiledFunctionUnwindInfo::None; + return Ok(CompiledFunctionUnwindInfo::None); } - match kind { + let info = match kind { FrameUnwindKind::Fastcall => CompiledFunctionUnwindInfo::Windows(data), FrameUnwindKind::Libunwind => { CompiledFunctionUnwindInfo::FrameLayout(data, offset, relocs) } - } + }; + + Ok(info) } /// Retuns true is no unwind info data. diff --git a/crates/environ/src/cranelift.rs b/crates/environ/src/cranelift.rs index 061457b1d50f..8ba8e04ab882 100644 --- a/crates/environ/src/cranelift.rs +++ b/crates/environ/src/cranelift.rs @@ -264,7 +264,10 @@ fn compile(env: CompileEnv<'_>) -> Result Date: Thu, 9 Apr 2020 13:34:04 -0700 Subject: [PATCH 39/41] forgot to rustfmt --- cranelift/codegen/src/isa/x86/abi.rs | 2 +- cranelift/codegen/src/isa/x86/unwind.rs | 5 ++++- cranelift/filetests/src/test_fde.rs | 4 +++- cranelift/filetests/src/test_unwind.rs | 4 +++- crates/environ/src/compilation.rs | 2 +- crates/environ/src/cranelift.rs | 7 +++---- crates/jit/src/compiler.rs | 15 +++++++-------- 7 files changed, 22 insertions(+), 17 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index 3d21f3344d47..4162f0037535 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -21,11 +21,11 @@ use crate::ir::{ FrameLayoutChange, InstBuilder, ValueLoc, }; use crate::isa::{CallConv, RegClass, RegUnit, TargetIsa}; -use alloc::vec::Vec; use crate::regalloc::RegisterSet; use crate::result::CodegenResult; use crate::stack_layout::layout_stack; use alloc::borrow::Cow; +use alloc::vec::Vec; use core::i32; use std::boxed::Box; use target_lexicon::{PointerWidth, Triple}; diff --git a/cranelift/codegen/src/isa/x86/unwind.rs b/cranelift/codegen/src/isa/x86/unwind.rs index abd405598cab..707b9e6d2c2b 100644 --- a/cranelift/codegen/src/isa/x86/unwind.rs +++ b/cranelift/codegen/src/isa/x86/unwind.rs @@ -474,7 +474,10 @@ mod tests { context.compile(&*isa).expect("expected compilation"); - assert_eq!(UnwindInfo::try_from_func(&context.func, &*isa, None).expect("can emit unwind info"), None); + assert_eq!( + UnwindInfo::try_from_func(&context.func, &*isa, None).expect("can emit unwind info"), + None + ); } #[test] diff --git a/cranelift/filetests/src/test_fde.rs b/cranelift/filetests/src/test_fde.rs index 8625f44890b7..5a9305479bba 100644 --- a/cranelift/filetests/src/test_fde.rs +++ b/cranelift/filetests/src/test_fde.rs @@ -64,7 +64,9 @@ impl SubTest for TestUnwind { } let mut sink = SimpleUnwindSink(Vec::new(), 0, Vec::new()); - comp_ctx.emit_unwind_info(isa, FrameUnwindKind::Libunwind, &mut sink).expect("can emit unwind info"); + comp_ctx + .emit_unwind_info(isa, FrameUnwindKind::Libunwind, &mut sink) + .expect("can emit unwind info"); let mut text = String::new(); if sink.0.is_empty() { diff --git a/cranelift/filetests/src/test_unwind.rs b/cranelift/filetests/src/test_unwind.rs index 2ddd3f0555ff..bf51cb73db0f 100644 --- a/cranelift/filetests/src/test_unwind.rs +++ b/cranelift/filetests/src/test_unwind.rs @@ -59,7 +59,9 @@ impl SubTest for TestUnwind { } let mut sink = Sink(Vec::new()); - comp_ctx.emit_unwind_info(isa, FrameUnwindKind::Fastcall, &mut sink).expect("can emit unwind info"); + comp_ctx + .emit_unwind_info(isa, FrameUnwindKind::Fastcall, &mut sink) + .expect("can emit unwind info"); let mut text = String::new(); if sink.0.is_empty() { diff --git a/crates/environ/src/compilation.rs b/crates/environ/src/compilation.rs index e9bdbcc7b380..82f8786d1792 100644 --- a/crates/environ/src/compilation.rs +++ b/crates/environ/src/compilation.rs @@ -4,7 +4,7 @@ use crate::cache::ModuleCacheDataTupleType; use crate::CacheConfig; use crate::ModuleTranslation; -use cranelift_codegen::{binemit, ir, isa, Context, CodegenResult}; +use cranelift_codegen::{binemit, ir, isa, CodegenResult, Context}; use cranelift_entity::PrimaryMap; use cranelift_wasm::{DefinedFuncIndex, FuncIndex, WasmError}; use serde::{Deserialize, Serialize}; diff --git a/crates/environ/src/cranelift.rs b/crates/environ/src/cranelift.rs index 8ba8e04ab882..7f979c12a5a7 100644 --- a/crates/environ/src/cranelift.rs +++ b/crates/environ/src/cranelift.rs @@ -264,10 +264,9 @@ fn compile(env: CompileEnv<'_>) -> Result Date: Thu, 9 Apr 2020 14:06:10 -0700 Subject: [PATCH 40/41] make emit_fde Result-y as well --- cranelift/codegen/src/isa/x86/abi.rs | 2 +- cranelift/codegen/src/isa/x86/fde.rs | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cranelift/codegen/src/isa/x86/abi.rs b/cranelift/codegen/src/isa/x86/abi.rs index 4162f0037535..c683f101ed2d 100644 --- a/cranelift/codegen/src/isa/x86/abi.rs +++ b/cranelift/codegen/src/isa/x86/abi.rs @@ -1161,7 +1161,7 @@ pub fn emit_unwind_info( } FrameUnwindKind::Libunwind => { if func.frame_layout.is_some() { - emit_fde(func, isa, sink); + emit_fde(func, isa, sink)?; } } } diff --git a/cranelift/codegen/src/isa/x86/fde.rs b/cranelift/codegen/src/isa/x86/fde.rs index 9d6e38de31af..85ed5b5f2a75 100644 --- a/cranelift/codegen/src/isa/x86/fde.rs +++ b/cranelift/codegen/src/isa/x86/fde.rs @@ -4,6 +4,7 @@ use crate::binemit::{FrameUnwindOffset, FrameUnwindSink, Reloc}; use crate::ir::{FrameLayoutChange, Function}; use crate::isa::fde::RegisterMappingError; use crate::isa::{CallConv, RegUnit, TargetIsa}; +use crate::result::CodegenResult; use alloc::vec::Vec; use core::convert::TryInto; use gimli::write::{ @@ -178,7 +179,11 @@ fn to_cfi( } /// Creates FDE structure from FrameLayout. -pub fn emit_fde(func: &Function, isa: &dyn TargetIsa, sink: &mut dyn FrameUnwindSink) { +pub fn emit_fde( + func: &Function, + isa: &dyn TargetIsa, + sink: &mut dyn FrameUnwindSink, +) -> CodegenResult<()> { assert!(isa.name() == "x86"); // Expecting function with System V prologue @@ -266,6 +271,8 @@ pub fn emit_fde(func: &Function, isa: &dyn TargetIsa, sink: &mut dyn FrameUnwind // Need 0 marker for GCC unwind to end FDE "list". sink.bytes(&[0, 0, 0, 0]); + + Ok(()) } #[cfg(test)] @@ -314,7 +321,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let mut sink = SimpleUnwindSink(Vec::new(), 0, Vec::new()); - emit_fde(&context.func, &*isa, &mut sink); + emit_fde(&context.func, &*isa, &mut sink).expect("can emit fde"); assert_eq!( sink.0, @@ -376,7 +383,7 @@ mod tests { context.compile(&*isa).expect("expected compilation"); let mut sink = SimpleUnwindSink(Vec::new(), 0, Vec::new()); - emit_fde(&context.func, &*isa, &mut sink); + emit_fde(&context.func, &*isa, &mut sink).expect("can emit fde"); assert_eq!( sink.0, From 89af7e131c684f6d755b24e7695dae79ae35d931 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 10 Apr 2020 10:02:39 -0700 Subject: [PATCH 41/41] Update cranelift/codegen/src/isa/mod.rs Co-Authored-By: bjorn3 --- cranelift/codegen/src/isa/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 643b98b5e208..af263c2b5bf0 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -399,8 +399,8 @@ pub trait TargetIsa: fmt::Display + Send + Sync { _kind: binemit::FrameUnwindKind, _sink: &mut dyn binemit::FrameUnwindSink, ) -> CodegenResult<()> { - Ok(()) // No-op by default + Ok(()) } }