Skip to content

Commit

Permalink
Wasmtime: Add support for Wasm tail calls (#6774)
Browse files Browse the repository at this point in the history
* Wasmtime: Add support for Wasm tail calls

This adds the `Config::wasm_tail_call` method and `--wasm-features tail-call`
CLI flag to enable the Wasm tail calls proposal in Wasmtime.

This PR is mostly just plumbing and enabling tests, since all the prerequisite
work (Wasmtime trampoline overhauls and Cranelift tail calls) was completed in
earlier pull requests.

When Wasm tail calls are enabled, Wasm code uses the `tail` calling
convention. The `tail` calling convention is known to cause a 1-7% slow down for
regular code that isn't using tail calls, which is why it isn't used
unconditionally. This involved shepherding `Tunables` through to Wasm signature
construction methods. The eventual plan is for the `tail` calling convention to
be used unconditionally, but not until the performance regression is
addressed. This work is tracked in
#6759

Additionally while our x86-64, aarch64, and riscv64 backends support tail calls,
the s390x backend does not support them yet. Attempts to use tail calls on s390x
will return errors. Support for s390x is tracked in
#6530

* Store `Tunables` inside the `Compiler`

Instead of passing as an argument to every `Compiler` method.

* Cranelift: Support "direct" return calls on riscv64

They still use `jalr` instead of `jal` but this allows us to use the `RiscvCall`
reloc, which Wasmtime handles. Before we were using `LoadExternalName` which
produces an `Abs8` reloc, which Wasmtime intentionally does not handle since
that involves patching code at runtime, which makes loading code slower.

* Fix tests that assume tail call support on s390x
  • Loading branch information
fitzgen authored Jul 27, 2023
1 parent 5ade0c7 commit 868f0c3
Show file tree
Hide file tree
Showing 32 changed files with 803 additions and 268 deletions.
10 changes: 5 additions & 5 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ fn main() -> anyhow::Result<()> {
test_directory(out, "tests/misc_testsuite", strategy)?;
test_directory_module(out, "tests/misc_testsuite/multi-memory", strategy)?;
test_directory_module(out, "tests/misc_testsuite/simd", strategy)?;
test_directory_module(out, "tests/misc_testsuite/tail-call", strategy)?;
test_directory_module(out, "tests/misc_testsuite/threads", strategy)?;
test_directory_module(out, "tests/misc_testsuite/memory64", strategy)?;
test_directory_module(out, "tests/misc_testsuite/component-model", strategy)?;
Expand Down Expand Up @@ -61,6 +62,7 @@ fn main() -> anyhow::Result<()> {
"tests/spec_testsuite/proposals/relaxed-simd",
strategy,
)?;
test_directory_module(out, "tests/spec_testsuite/proposals/tail-call", strategy)?;
} else {
println!(
"cargo:warning=The spec testsuite is disabled. To enable, run `git submodule \
Expand Down Expand Up @@ -213,11 +215,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
return true;
}

// Tail calls are not yet implemented.
if testname.contains("return_call") {
return true;
}

if testsuite == "function_references" {
// The following tests fail due to function references not yet
// being exposed in the public API.
Expand All @@ -241,6 +238,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
"s390x" => {
// FIXME: These tests fail under qemu due to a qemu bug.
testname == "simd_f32x4_pmin_pmax" || testname == "simd_f64x2_pmin_pmax"
// TODO(#6530): These tests require tail calls, but s390x
// doesn't support them yet.
|| testsuite == "function_references" || testsuite == "tail_call"
}

"riscv64" => {
Expand Down
2 changes: 1 addition & 1 deletion cranelift/codegen/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,7 @@ impl<'f> FuncCursor<'f> {
}

/// Create an instruction builder that inserts an instruction at the current position.
pub fn ins(&mut self) -> ir::InsertBuilder<&mut FuncCursor<'f>> {
pub fn ins(&mut self) -> ir::InsertBuilder<'_, &mut FuncCursor<'f>> {
ir::InsertBuilder::new(self)
}
}
Expand Down
14 changes: 7 additions & 7 deletions cranelift/codegen/src/isa/riscv64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -702,13 +702,13 @@ impl Riscv64ABICallSite {
});

match dest {
// TODO: Our riscv64 backend doesn't have relocs for direct calls,
// the callee is always put in a register and then the register is
// relocated, so we don't currently differentiate between
// `RelocDistance::Near` and `RelocDistance::Far`. We just always
// use indirect calls. We should eventually add a non-indirect
// `return_call` instruction and path.
CallDest::ExtName(name, _) => {
CallDest::ExtName(name, RelocDistance::Near) => {
ctx.emit(Inst::ReturnCall {
callee: Box::new(name),
info,
});
}
CallDest::ExtName(name, RelocDistance::Far) => {
let callee = ctx.alloc_tmp(ir::types::I64).only_reg().unwrap();
ctx.emit(Inst::LoadExtName {
rd: callee,
Expand Down
5 changes: 5 additions & 0 deletions cranelift/codegen/src/isa/riscv64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@
(CallInd
(info BoxCallIndInfo))

;; A direct return-call macro instruction.
(ReturnCall
(callee BoxExternalName)
(info BoxReturnCallInfo))

;; An indirect return-call macro instruction.
(ReturnCallInd
(callee Reg)
Expand Down
37 changes: 35 additions & 2 deletions cranelift/codegen/src/isa/riscv64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ impl Inst {
| Inst::AdjustSp { .. }
| Inst::Call { .. }
| Inst::CallInd { .. }
| Inst::ReturnCall { .. }
| Inst::ReturnCallInd { .. }
| Inst::TrapIf { .. }
| Inst::Jal { .. }
Expand Down Expand Up @@ -884,6 +885,32 @@ impl MachInstEmit for Inst {
);
}

&Inst::ReturnCall {
ref callee,
ref info,
} => {
emit_return_call_common_sequence(
&mut allocs,
sink,
emit_info,
state,
info.new_stack_arg_size,
info.old_stack_arg_size,
&info.uses,
);

sink.add_call_site(ir::Opcode::ReturnCall);
sink.add_reloc(Reloc::RiscvCall, &callee, 0);
Inst::construct_auipc_and_jalr(None, writable_spilltmp_reg(), 0)
.into_iter()
.for_each(|i| i.emit(&[], sink, emit_info, state));

// `emit_return_call_common_sequence` emits an island if
// necessary, so we can safely disable the worst-case-size check
// in this case.
start_off = sink.cur_offset();
}

&Inst::ReturnCallInd { callee, ref info } => {
let callee = allocs.next(callee);

Expand All @@ -903,6 +930,11 @@ impl MachInstEmit for Inst {
offset: Imm12::zero(),
}
.emit(&[], sink, emit_info, state);

// `emit_return_call_common_sequence` emits an island if
// necessary, so we can safely disable the worst-case-size check
// in this case.
start_off = sink.cur_offset();
}

&Inst::Jal { dest } => {
Expand Down Expand Up @@ -3089,9 +3121,10 @@ fn emit_return_call_common_sequence(

// We are emitting a dynamic number of instructions and might need an
// island. We emit four instructions regardless of how many stack arguments
// we have, and then two instructions per word of stack argument space.
// we have, up to two instructions for the actual call, and then two
// instructions per word of stack argument space.
let new_stack_words = new_stack_arg_size / 8;
let insts = 4 + 2 * new_stack_words;
let insts = 4 + 2 + 2 * new_stack_words;
let space_needed = insts * u32::try_from(Inst::INSTRUCTION_SIZE).unwrap();
if sink.island_needed(space_needed) {
let jump_around_label = sink.get_label();
Expand Down
25 changes: 24 additions & 1 deletion cranelift/codegen/src/isa/riscv64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,14 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
}
collector.reg_clobbers(info.clobbers);
}
&Inst::ReturnCall {
callee: _,
ref info,
} => {
for u in &info.uses {
collector.reg_fixed_use(u.vreg, u.preg);
}
}
&Inst::ReturnCallInd { ref info, callee } => {
collector.reg_use(callee);
for u in &info.uses {
Expand Down Expand Up @@ -880,7 +888,7 @@ impl MachInst for Inst {
&Inst::Jalr { .. } => MachTerminator::Uncond,
&Inst::Ret { .. } => MachTerminator::Ret,
&Inst::BrTable { .. } => MachTerminator::Indirect,
&Inst::ReturnCallInd { .. } => MachTerminator::RetCall,
&Inst::ReturnCall { .. } | &Inst::ReturnCallInd { .. } => MachTerminator::RetCall,
_ => MachTerminator::None,
}
}
Expand Down Expand Up @@ -1602,6 +1610,21 @@ impl Inst {
let rd = format_reg(info.rn, allocs);
format!("callind {}", rd)
}
&MInst::ReturnCall {
ref callee,
ref info,
} => {
let mut s = format!(
"return_call {callee:?} old_stack_arg_size:{} new_stack_arg_size:{}",
info.old_stack_arg_size, info.new_stack_arg_size
);
for ret in &info.uses {
let preg = format_reg(ret.preg, &mut empty_allocs);
let vreg = format_reg(ret.vreg, allocs);
write!(&mut s, " {vreg}={preg}").unwrap();
}
s
}
&MInst::ReturnCallInd { callee, ref info } => {
let callee = format_reg(callee, allocs);
let mut s = format!(
Expand Down
3 changes: 3 additions & 0 deletions cranelift/codegen/src/machinst/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1781,6 +1781,9 @@ impl<I: VCodeInst> TextSectionBuilder for MachTextSectionBuilder<I> {
}

fn resolve_reloc(&mut self, offset: u64, reloc: Reloc, addend: Addend, target: usize) -> bool {
crate::trace!(
"Resolving relocation @ {offset:#x} + {addend:#x} to target {target} of kind {reloc:?}"
);
let label = MachLabel::from_block(BlockIndex::new(target));
let offset = u32::try_from(offset).unwrap();
match I::LabelUse::from_reloc(reloc, addend) {
Expand Down
11 changes: 3 additions & 8 deletions cranelift/filetests/filetests/isa/riscv64/return-call.clif
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ block0(v0: i64):
; sd fp,0(sp)
; mv fp,sp
; block0:
; load_sym t2,%callee_i64+0
; return_call_ind t2 old_stack_arg_size:0 new_stack_arg_size:0 s1=s1
; return_call TestCase(%callee_i64) old_stack_arg_size:0 new_stack_arg_size:0 s1=s1
;
; Disassembled:
; block0: ; offset 0x0
Expand All @@ -79,16 +78,12 @@ block0(v0: i64):
; sd s0, 0(sp)
; ori s0, sp, 0
; block1: ; offset 0x10
; auipc t2, 0
; ld t2, 0xc(t2)
; j 0xc
; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %callee_i64 0
; .byte 0x00, 0x00, 0x00, 0x00
; ld ra, 8(s0)
; ld t6, 0(s0)
; addi sp, s0, 0x10
; ori s0, t6, 0
; jr t2
; auipc t6, 0 ; reloc_external RiscvCall %callee_i64 0
; jr t6

;;;; Test passing `f64`s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

Expand Down
11 changes: 11 additions & 0 deletions cranelift/filetests/src/test_wasm/env.rs
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,17 @@ impl<'a> FuncEnvironment for FuncEnv<'a> {
)
}

fn translate_return_call_ref(
&mut self,
builder: &mut cranelift_frontend::FunctionBuilder,
sig_ref: ir::SigRef,
callee: ir::Value,
call_args: &[ir::Value],
) -> cranelift_wasm::WasmResult<()> {
self.inner
.translate_return_call_ref(builder, sig_ref, callee, call_args)
}

fn translate_memory_grow(
&mut self,
pos: cranelift_codegen::cursor::FuncCursor,
Expand Down
2 changes: 1 addition & 1 deletion cranelift/frontend/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ use hashbrown::HashMap;
#[cfg(feature = "std")]
use std::collections::HashMap;

pub use crate::frontend::{FunctionBuilder, FunctionBuilderContext};
pub use crate::frontend::{FuncInstBuilder, FunctionBuilder, FunctionBuilderContext};
pub use crate::switch::Switch;
pub use crate::variable::Variable;

Expand Down
24 changes: 17 additions & 7 deletions cranelift/wasm/src/code_translator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
);

let call = environ.translate_call(
builder.cursor(),
builder,
FuncIndex::from_u32(*function_index),
fref,
args,
Expand Down Expand Up @@ -694,7 +694,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
);

environ.translate_return_call(
builder.cursor(),
builder,
FuncIndex::from_u32(*function_index),
fref,
args,
Expand Down Expand Up @@ -731,11 +731,21 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
state.popn(num_args);
state.reachable = false;
}
Operator::ReturnCallRef { type_index: _ } => {
return Err(wasm_unsupported!(
"proposed tail-call operator for function references {:?}",
op
));
Operator::ReturnCallRef { type_index } => {
// Get function signature
// `index` is the index of the function's signature and `table_index` is the index of
// the table to search the function in.
let (sigref, num_args) = state.get_indirect_sig(builder.func, *type_index, environ)?;
let callee = state.pop1();

// Bitcast any vector arguments to their default type, I8X16, before calling.
let args = state.peekn_mut(num_args);
bitcast_wasm_params(environ, sigref, args, builder);

environ.translate_return_call_ref(builder, sigref, callee, state.peekn(num_args))?;

state.popn(num_args);
state.reachable = false;
}
/******************************* Memory management ***********************************
* Memory management is handled by environment. It is usually translated into calls to
Expand Down
23 changes: 18 additions & 5 deletions cranelift/wasm/src/environ/dummy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -457,26 +457,39 @@ impl<'dummy_environment> FuncEnvironment for DummyFuncEnvironment<'dummy_environ
unimplemented!()
}

fn translate_return_call_ref(
&mut self,
_builder: &mut FunctionBuilder,
_sig_ref: ir::SigRef,
_callee: ir::Value,
_call_args: &[ir::Value],
) -> WasmResult<()> {
unimplemented!()
}

fn translate_call(
&mut self,
mut pos: FuncCursor,
builder: &mut FunctionBuilder,
_callee_index: FuncIndex,
callee: ir::FuncRef,
call_args: &[ir::Value],
) -> WasmResult<ir::Inst> {
// Pass the current function's vmctx parameter on to the callee.
let vmctx = pos
let vmctx = builder
.func
.special_param(ir::ArgumentPurpose::VMContext)
.expect("Missing vmctx parameter");

// Build a value list for the call instruction containing the call_args and the vmctx
// parameter.
let mut args = ir::ValueList::default();
args.extend(call_args.iter().cloned(), &mut pos.func.dfg.value_lists);
args.push(vmctx, &mut pos.func.dfg.value_lists);
args.extend(call_args.iter().cloned(), &mut builder.func.dfg.value_lists);
args.push(vmctx, &mut builder.func.dfg.value_lists);

Ok(pos.ins().Call(ir::Opcode::Call, INVALID, callee, args).0)
Ok(builder
.ins()
.Call(ir::Opcode::Call, INVALID, callee, args)
.0)
}

fn translate_call_ref(
Expand Down
Loading

0 comments on commit 868f0c3

Please sign in to comment.