Skip to content

Commit

Permalink
Cranelift: Implement tail calls on riscv64 (bytecodealliance#6749)
Browse files Browse the repository at this point in the history
* Cranelift: Implement tail calls on riscv64

Co-Authored-By: Jamey Sharp <jsharp@fastly.com>

* Use existing variable rather than recomputing value

Co-authored-by: Trevor Elliott <awesomelyawesome@gmail.com>

---------

Co-authored-by: Jamey Sharp <jsharp@fastly.com>
Co-authored-by: Trevor Elliott <awesomelyawesome@gmail.com>
  • Loading branch information
3 people authored Jul 24, 2023
1 parent 0f9ac11 commit 1055e28
Show file tree
Hide file tree
Showing 11 changed files with 998 additions and 14 deletions.
39 changes: 39 additions & 0 deletions cranelift/codegen/src/isa/riscv64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,45 @@ impl ABIMachineSpec for Riscv64MachineDeps {
}
}

impl Riscv64ABICallSite {
    /// Lowers a tail call at this call site, consuming the call site.
    ///
    /// The arguments are first staged by `emit_temporary_tail_call_frame`,
    /// which reports the new and old stack-argument sizes. Those sizes, the
    /// opcode, and the argument uses are boxed into a `ReturnCallInfo` that is
    /// attached to the single `Inst::ReturnCallInd` emitted at the end.
    pub fn emit_return_call(mut self, ctx: &mut Lower<Inst>, args: isle::ValueSlice) {
        let (new_stack_arg_size, old_stack_arg_size) =
            self.emit_temporary_tail_call_frame(ctx, args);

        let target = self.dest().clone();
        let info = Box::new(ReturnCallInfo {
            opcode: self.opcode(),
            uses: self.take_uses(),
            old_stack_arg_size,
            new_stack_arg_size,
        });

        // Resolve the destination down to a register holding the callee
        // address; both destination kinds end in the same indirect
        // return-call instruction.
        //
        // TODO: Our riscv64 backend doesn't have relocs for direct calls,
        // the callee is always put in a register and then the register is
        // relocated, so we don't currently differentiate between
        // `RelocDistance::Near` and `RelocDistance::Far`. We just always
        // use indirect calls. We should eventually add a non-indirect
        // `return_call` instruction and path.
        let callee = match target {
            CallDest::ExtName(name, _) => {
                // Materialize the external symbol's address in a fresh
                // temporary before jumping through it.
                let tmp = ctx.alloc_tmp(ir::types::I64).only_reg().unwrap();
                ctx.emit(Inst::LoadExtName {
                    rd: tmp,
                    name: Box::new(name),
                    offset: 0,
                });
                tmp.to_reg()
            }
            CallDest::Reg(reg) => reg,
        };

        ctx.emit(Inst::ReturnCallInd { callee, info });
    }
}

const CALLEE_SAVE_X_REG: [bool; 32] = [
false, false, true, false, false, false, false, false, // 0-7
true, true, false, false, false, false, false, false, // 8-15
Expand Down
6 changes: 6 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@
(CallInd
(info BoxCallIndInfo))

;; An indirect return-call macro instruction.
(ReturnCallInd
(callee Reg)
(info BoxReturnCallInfo))

(TrapIf
(test Reg)
(trap_code TrapCode))
Expand Down Expand Up @@ -720,6 +725,7 @@
(type VecBranchTarget (primitive VecBranchTarget))
(type BoxCallInfo (primitive BoxCallInfo))
(type BoxCallIndInfo (primitive BoxCallIndInfo))
(type BoxReturnCallInfo (primitive BoxReturnCallInfo))
(type IntegerCompare (primitive IntegerCompare))
(type AMode (primitive AMode))
(type OptionReg (primitive OptionReg))
Expand Down
172 changes: 169 additions & 3 deletions cranelift/codegen/src/isa/riscv64/inst/emit.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
//! Riscv64 ISA: binary code emission.

use crate::binemit::StackMap;
use crate::ir::RelSourceLoc;
use crate::ir::TrapCode;
use crate::ir::{self, RelSourceLoc, TrapCode};
use crate::isa::riscv64::inst::*;
use crate::isa::riscv64::inst::{zero_reg, AluOPRRR};
use crate::machinst::{AllocationConsumer, Reg, Writable};
use crate::trace;
use cranelift_control::ControlPlane;
Expand Down Expand Up @@ -426,6 +424,7 @@ impl Inst {
| Inst::AdjustSp { .. }
| Inst::Call { .. }
| Inst::CallInd { .. }
| Inst::ReturnCallInd { .. }
| Inst::TrapIf { .. }
| Inst::Jal { .. }
| Inst::CondBr { .. }
Expand Down Expand Up @@ -885,6 +884,27 @@ impl MachInstEmit for Inst {
);
}

&Inst::ReturnCallInd { callee, ref info } => {
let callee = allocs.next(callee);

emit_return_call_common_sequence(
&mut allocs,
sink,
emit_info,
state,
info.new_stack_arg_size,
info.old_stack_arg_size,
&info.uses,
);

Inst::Jalr {
rd: writable_zero_reg(),
base: callee,
offset: Imm12::zero(),
}
.emit(&[], sink, emit_info, state);
}

&Inst::Jal { dest } => {
let code: u32 = 0b1101111;
match dest {
Expand Down Expand Up @@ -3056,3 +3076,149 @@ fn alloc_value_regs(orgin: &ValueRegs<Reg>, alloc: &mut AllocationConsumer) -> V
_ => unreachable!(),
}
}

/// Emits the part of a tail call that tears down the current frame and moves
/// the new stack arguments into place. The final jump to the callee is NOT
/// emitted here; the caller of this helper emits it afterwards.
///
/// Parameters:
/// * `allocs` - allocation consumer; one allocation is consumed (and
///   discarded) per entry of `uses`.
/// * `sink`, `emit_info`, `state` - forwarded to every `Inst::emit` call.
///   `state.virtual_sp_offset` is decreased by `new_stack_arg_size` at the end.
/// * `new_stack_arg_size` - size in bytes of the tail callee's stack
///   arguments; asserted to be a multiple of 8.
/// * `old_stack_arg_size` - size of the current function's own stack
///   arguments, used to compute where SP must point for the callee.
/// * `uses` - the register arguments of the tail call.
///
/// Panics if `new_stack_arg_size` is not 8-byte aligned, if
/// `Inst::INSTRUCTION_SIZE` does not fit in a `u32`, or if the FP-to-SP delta
/// is rejected by `Imm12::maybe_from_u64`.
fn emit_return_call_common_sequence(
allocs: &mut AllocationConsumer<'_>,
sink: &mut MachBuffer<Inst>,
emit_info: &EmitInfo,
state: &mut EmitState,
new_stack_arg_size: u32,
old_stack_arg_size: u32,
uses: &CallArgList,
) {
// Consume (and ignore) the allocation for every argument use. This mirrors
// the operand order registered for `ReturnCallInd`: the callee first (taken
// by the caller of this helper), then each use.
for u in uses {
let _ = allocs.next(u.vreg);
}

// We are emitting a dynamic number of instructions and might need an
// island. We emit four instructions regardless of how many stack arguments
// we have, and then two instructions per word of stack argument space.
let new_stack_words = new_stack_arg_size / 8;
let insts = 4 + 2 * new_stack_words;
let space_needed = insts * u32::try_from(Inst::INSTRUCTION_SIZE).unwrap();
if sink.island_needed(space_needed) {
// Jump over the island so execution falls through to the sequence below.
let jump_around_label = sink.get_label();
Inst::Jal {
dest: BranchTarget::Label(jump_around_label),
}
.emit(&[], sink, emit_info, state);
// NOTE(review): the extra 4 bytes presumably account for the `Jal` just
// emitted above — confirm.
sink.emit_island(space_needed + 4, &mut state.ctrl_plane);
sink.bind_label(jump_around_label, &mut state.ctrl_plane);
}

// Copy the new frame on top of our current frame.
//
// The current stack layout is the following:
//
//            |          ...          |
//            +-----------------------+
//            |          ...          |
//            |    stack arguments    |
//            |          ...          |
//    current |    return address     |
//    frame   |        old FP         | <-- FP
//            |          ...          |
//            |    old stack slots    |
//            |          ...          |
//            +-----------------------+
//            |          ...          |
//    new     |  new stack arguments  |
//    frame   |          ...          | <-- SP
//            +-----------------------+
//
// We need to restore the old FP, restore the return address from the stack
// to the link register, copy the new stack arguments over the old stack
// arguments, adjust SP to point to the new stack arguments, and then jump
// to the callee (which will push the old FP and RA again). Note that the
// actual jump happens outside this helper function.

assert_eq!(
new_stack_arg_size % 8,
0,
"size of new stack arguments must be 8-byte aligned"
);

// The delta from our frame pointer to the (eventual) stack pointer value
// when we jump to the tail callee. This is the difference in size of stack
// arguments as well as accounting for the two words we pushed onto the
// stack upon entry to this function (the return address and old frame
// pointer).
//
// The value is negative whenever the new stack arguments are more than 16
// bytes larger than the old ones, hence the signed arithmetic.
let fp_to_callee_sp = i64::from(old_stack_arg_size) - i64::from(new_stack_arg_size) + 16;

// `tmp1` holds the old FP until the very end; `tmp2` is the scratch register
// for the word-by-word argument copy.
let tmp1 = regs::writable_spilltmp_reg();
let tmp2 = regs::writable_spilltmp_reg2();

// Restore the return address to the link register, and load the old FP into
// a temporary register.
//
// We can't put the old FP into the FP register until after we copy the
// stack arguments into place, since that uses address modes that are
// relative to our current FP.
//
// Note that the FP is saved in the function prologue for all non-leaf
// functions, even when `preserve_frame_pointers=false`. Note also that
// `return_call` instructions make it so that a function is considered
// non-leaf. Therefore we always have an FP to restore here.

Inst::gen_load(
writable_link_reg(),
AMode::FPOffset(8, I64),
I64,
MemFlags::trusted(),
)
.emit(&[], sink, emit_info, state);
Inst::gen_load(tmp1, AMode::FPOffset(0, I64), I64, MemFlags::trusted()).emit(
&[],
sink,
emit_info,
state,
);

// Copy the new stack arguments over the old stack arguments.
//
// Words are copied from the highest offset down to the lowest.
// NOTE(review): presumably this ordering ensures that, if source and
// destination overlap, each source word is read before it is overwritten —
// confirm.
for i in (0..new_stack_words).rev() {
// Load the `i`th new stack argument word from the temporary stack
// space.
Inst::gen_load(
tmp2,
AMode::SPOffset(i64::from(i * 8), types::I64),
types::I64,
ir::MemFlags::trusted(),
)
.emit(&[], sink, emit_info, state);

// Store it to its final destination on the stack, overwriting our
// current frame.
Inst::gen_store(
AMode::FPOffset(fp_to_callee_sp + i64::from(i * 8), types::I64),
tmp2.to_reg(),
types::I64,
ir::MemFlags::trusted(),
)
.emit(&[], sink, emit_info, state);
}

// Initialize the SP for the tail callee, deallocating the temporary stack
// argument space and our current frame at the same time.
//
// The `as u64` cast passes a possibly negative delta through as its
// two's-complement bit pattern.
// NOTE(review): the `unwrap` panics if `Imm12::maybe_from_u64` rejects the
// delta; a large enough difference in stack-argument sizes would presumably
// not fit a 12-bit immediate — confirm callers bound it.
Inst::AluRRImm12 {
alu_op: AluOPRRI::Addi,
rd: regs::writable_stack_reg(),
rs: regs::fp_reg(),
imm12: Imm12::maybe_from_u64(fp_to_callee_sp as u64).unwrap(),
}
.emit(&[], sink, emit_info, state);

// Move the old FP value from the temporary into the FP register.
Inst::Mov {
ty: types::I64,
rd: regs::writable_fp_reg(),
rm: tmp1.to_reg(),
}
.emit(&[], sink, emit_info, state);

// The temporary stack-argument space is gone now, so take it back out of the
// tracked virtual SP offset.
state.virtual_sp_offset -= i64::from(new_stack_arg_size);
trace!(
"return_call[_ind] adjusts virtual sp offset by {} -> {}",
new_stack_arg_size,
state.virtual_sp_offset
);
}
32 changes: 32 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRImm5, Ve

type BoxCallInfo = Box<CallInfo>;
type BoxCallIndInfo = Box<CallIndInfo>;
type BoxReturnCallInfo = Box<ReturnCallInfo>;

/// Additional information for (direct) Call instructions, left out of line to lower the size of
/// the Inst enum.
Expand Down Expand Up @@ -91,6 +92,16 @@ pub struct CallIndInfo {
pub callee_pop_size: u32,
}

/// Additional information for `return_call[_ind]` instructions, left out of
/// line to lower the size of the `Inst` enum.
///
/// Built when a tail call is lowered and carried, boxed, by
/// `Inst::ReturnCallInd`.
#[derive(Clone, Debug)]
pub struct ReturnCallInfo {
/// Register arguments of the tail call. Operand collection registers each
/// entry as a fixed use of `vreg` in `preg`.
pub uses: CallArgList,
/// Opcode recorded from the call site when the instruction is lowered.
pub opcode: Opcode,
/// Size of the current function's own stack-argument area. Emission
/// combines it with `new_stack_arg_size` to compute the stack pointer
/// handed to the tail callee.
pub old_stack_arg_size: u32,
/// Size in bytes of the stack arguments for the tail callee. Emission
/// asserts it is a multiple of 8 and copies it one 8-byte word at a time.
pub new_stack_arg_size: u32,
}

/// A branch target. Either unresolved (basic-block index) or resolved (offset
/// from end of current instruction).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
Expand Down Expand Up @@ -448,6 +459,12 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
}
collector.reg_clobbers(info.clobbers);
}
&Inst::ReturnCallInd { ref info, callee } => {
collector.reg_use(callee);
for u in &info.uses {
collector.reg_fixed_use(u.vreg, u.preg);
}
}
&Inst::TrapIf { test, .. } => {
collector.reg_use(test);
}
Expand Down Expand Up @@ -863,6 +880,7 @@ impl MachInst for Inst {
&Inst::Jalr { .. } => MachTerminator::Uncond,
&Inst::Ret { .. } => MachTerminator::Ret,
&Inst::BrTable { .. } => MachTerminator::Indirect,
&Inst::ReturnCallInd { .. } => MachTerminator::RetCall,
_ => MachTerminator::None,
}
}
Expand Down Expand Up @@ -1049,6 +1067,7 @@ impl Inst {
}
}

let mut empty_allocs = AllocationConsumer::default();
match self {
&Inst::Nop0 => {
format!("##zero length nop")
Expand Down Expand Up @@ -1583,6 +1602,19 @@ impl Inst {
let rd = format_reg(info.rn, allocs);
format!("callind {}", rd)
}
&MInst::ReturnCallInd { callee, ref info } => {
let callee = format_reg(callee, allocs);
let mut s = format!(
"return_call_ind {callee} old_stack_arg_size:{} new_stack_arg_size:{}",
info.old_stack_arg_size, info.new_stack_arg_size
);
for ret in &info.uses {
let preg = format_reg(ret.preg, &mut empty_allocs);
let vreg = format_reg(ret.vreg, allocs);
write!(&mut s, " {vreg}={preg}").unwrap();
}
s
}
&MInst::TrapIf { test, trap_code } => {
format!("trap_if {},{}", format_reg(test, allocs), trap_code,)
}
Expand Down
9 changes: 9 additions & 0 deletions cranelift/codegen/src/isa/riscv64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1658,6 +1658,15 @@
(rule (lower (call_indirect sig_ref val inputs))
(gen_call_indirect sig_ref val inputs))

;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct tail call: unpack the function reference into its signature,
;; external name, and `dist` (presumably the relocation distance -- confirm),
;; then delegate to `gen_return_call`.
(rule (lower (return_call (func_ref_data sig_ref extname dist) args))
(gen_return_call sig_ref extname dist args))

;; Indirect tail call: the callee is already a value, so the signature, callee,
;; and arguments are forwarded unchanged to `gen_return_call_indirect`.
(rule (lower (return_call_indirect sig_ref callee args))
(gen_return_call_indirect sig_ref callee args))


;;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (extractlane x @ (value_type ty) (u8_from_uimm8 idx)))
Expand Down
Loading

0 comments on commit 1055e28

Please sign in to comment.