Commit 601a96d

Wasmtime: remove indirect-call caching.
In the original development of this feature, guided by a microbenchmark of JS
AOT-compiled to Wasm that leans heavily on inline-cache (IC) sites, I was
seeing a ~20% speedup. In more recent measurements on full programs (e.g., the
Octane benchmark suite), however, the benefit is more like 5%.

Moreover, in bytecodealliance#8870 I attempted to switch to a direct-mapped
cache to address a shortcoming of the current design: it hard-caps the number
of callsites it can apply to (50k) in order to limit the impact on VMContext
struct size. With all of the checks needed for correctness, though, that
change resulted in a 2.5% slowdown relative to no caching at all, so it was
dropped.
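
For scale, a back-of-envelope estimate of why the cap exists (an inference
from the codegen shown below, which reads a 4-byte index at slot offset 0 and
a pointer at offset 8, not a figure stated in the PR):

```rust
// Assumed slot layout on a 64-bit target: 4-byte u32 index (key), 4 bytes of
// padding, then an 8-byte raw code pointer (value) = 16 bytes per callsite.
const SLOT_BYTES: usize = 16;
const MAX_SLOTS: usize = 50_000;
// At the cap, roughly 800 KB of VMContext growth per instance:
const VMCTX_GROWTH: usize = MAX_SLOTS * SLOT_BYTES; // 800_000 bytes
```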

In the process of thinking that through, I discovered that the current design
on `main` handles null funcrefs incorrectly: it invokes a null code pointer
rather than loading a field from a null struct pointer. The latter was
specifically designed in bytecodealliance#8159 to cause the necessary Wasm
trap, but I had missed that a call to a null code pointer would not have the
same effect. As a result, with the feature enabled (it is still off by
default), we can actually crash the VM: safely, at least, but still no good
compared to a proper Wasm trap! That could be fixed too, but given the small
benefit on real programs, together with the limitation on module size for the
full benefit, I'd rather opt for simplicity and remove the cache entirely.
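
To illustrate the distinction, here is Rust pseudocode for what the two
generated paths effectively did (a minimal sketch with invented names, not
Wasmtime's actual code):

```rust
#[repr(C)]
struct VMFuncRefLike {
    wasm_call: *const u8, // raw code pointer
}

// Uncached path: a null funcref faults on this *load*, at an instruction the
// runtime's trap machinery recognizes, so it becomes a proper Wasm trap.
unsafe fn load_code_ptr(funcref: *const VMFuncRefLike) -> *const u8 {
    (*funcref).wasm_call
}

// Cached path as it existed on `main`: a null cached code pointer is *called*;
// the fault happens at address 0 rather than at a recognized trapping load,
// so it surfaces as a VM crash (contained, but not the intended Wasm trap).
unsafe fn call_code_ptr(code_ptr: *const u8) {
    let f: unsafe extern "C" fn() = std::mem::transmute(code_ptr);
    f();
}
```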

Thus, this PR removes call-indirect caching. It is not a direct revert
because the original PR refactored call-indirect generation into smaller
helpers, and IMHO it's a bit nicer to keep that. Otherwise, though, all
traces are gone: the setting, the pre-scan of the code section during
compilation, the special conditions tracked on tables, and the codegen
changes.
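
The pre-scan mentioned above assigned each function a starting cache-slot
index that saturated at the configured limit, so functions past the limit got
zero usable slots. Roughly (a hypothetical sketch, not the deleted
`ModuleEnvironment` code; the function name is invented):

```rust
/// Given the number of `call_indirect` sites in each function (in code-section
/// order), assign each function its first cache-slot index, saturating at
/// `max_slots` so that later functions receive no usable slots.
fn assign_call_indirect_starts(sites_per_func: &[usize], max_slots: usize) -> Vec<usize> {
    let mut next = 0;
    sites_per_func
        .iter()
        .map(|&n| {
            let start = next;
            next = (next + n).min(max_slots);
            start
        })
        .collect()
}
```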
cfallin committed Jun 27, 2024
1 parent 7ac3fda commit 601a96d
Showing 20 changed files with 23 additions and 1,434 deletions.
17 changes: 0 additions & 17 deletions crates/cli-flags/src/lib.rs
@@ -115,17 +115,6 @@ wasmtime_option_group! {
/// The maximum runtime size of each linear memory in the pooling
/// allocator, in bytes.
pub pooling_max_memory_size: Option<usize>,

/// Whether to enable call-indirect caching.
pub cache_call_indirects: Option<bool>,

/// The maximum call-indirect cache slot count.
///
/// One slot is allocated per indirect callsite; if the module
/// has more indirect callsites than this limit, then the
/// first callsites in linear order in the code section, up to
/// the limit, will receive a cache slot.
pub max_call_indirect_cache_slots: Option<usize>,
}

enum Optimize {
@@ -576,12 +565,6 @@ impl CommonOptions {
if let Some(enable) = self.opts.memory_init_cow {
config.memory_init_cow(enable);
}
if let Some(enable) = self.opts.cache_call_indirects {
config.cache_call_indirects(enable);
}
if let Some(max) = self.opts.max_call_indirect_cache_slots {
config.max_call_indirect_cache_slots(max);
}

match_feature! {
["pooling-allocator" : self.opts.pooling_allocator.or(pooling_allocator_default)]
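For context, the options removed above were plumbed into the embedding API
like so (a sketch against the pre-PR `wasmtime::Config`; both setters are
deleted by this commit and no longer exist afterward):

```rust
use wasmtime::Config;

fn pre_pr_config() -> Config {
    let mut config = Config::new();
    // Shown only to illustrate the plumbing removed above; these setters
    // existed prior to this PR and are removed by it.
    config.cache_call_indirects(true);
    config.max_call_indirect_cache_slots(50_000);
    config
}
```
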
16 changes: 3 additions & 13 deletions crates/cranelift/src/compiler.rs
@@ -147,14 +147,8 @@ impl wasmtime_environ::Compiler for Compiler {
context.func.collect_debug_info();
}

let mut func_env = FuncEnvironment::new(
isa,
translation,
types,
&self.tunables,
self.wmemcheck,
input.call_indirect_start,
);
let mut func_env =
FuncEnvironment::new(isa, translation, types, &self.tunables, self.wmemcheck);

// The `stack_limit` global value below is the implementation of stack
// overflow checks in Wasmtime.
@@ -206,11 +200,7 @@
flags: MemFlags::trusted(),
});
context.func.stack_limit = Some(stack_limit);
let FunctionBodyData {
validator,
body,
call_indirect_start: _,
} = input;
let FunctionBodyData { validator, body } = input;
let mut validator =
validator.into_validator(mem::take(&mut compiler.cx.validator_allocations));
compiler.cx.func_translator.translate_body(
233 changes: 11 additions & 222 deletions crates/cranelift/src/func_environ.rs
@@ -12,9 +12,9 @@ use cranelift_entity::{EntityRef, PrimaryMap, SecondaryMap};
use cranelift_frontend::FunctionBuilder;
use cranelift_frontend::Variable;
use cranelift_wasm::{
CallIndirectSiteIndex, EngineOrModuleTypeIndex, FuncIndex, FuncTranslationState, GlobalIndex,
GlobalVariable, Heap, HeapData, HeapStyle, MemoryIndex, TableData, TableIndex, TableSize,
TargetEnvironment, TypeIndex, WasmHeapTopType, WasmHeapType, WasmResult,
EngineOrModuleTypeIndex, FuncIndex, FuncTranslationState, GlobalIndex, GlobalVariable, Heap,
HeapData, HeapStyle, MemoryIndex, TableData, TableIndex, TableSize, TargetEnvironment,
TypeIndex, WasmHeapTopType, WasmHeapType, WasmResult,
};
use std::mem;
use wasmparser::Operator;
@@ -138,9 +138,6 @@ pub struct FuncEnvironment<'module_environment> {

#[cfg(feature = "wmemcheck")]
wmemcheck: bool,

/// The current call-indirect-cache index.
pub call_indirect_index: usize,
}

impl<'module_environment> FuncEnvironment<'module_environment> {
@@ -150,7 +147,6 @@ impl<'module_environment> FuncEnvironment<'module_environment> {
types: &'module_environment ModuleTypesBuilder,
tunables: &'module_environment Tunables,
wmemcheck: bool,
call_indirect_start: usize,
) -> Self {
let builtin_functions = BuiltinFunctions::new(isa);

@@ -178,8 +174,6 @@ impl<'module_environment> FuncEnvironment<'module_environment> {
// functions should consume at least some fuel.
fuel_consumed: 1,

call_indirect_index: call_indirect_start,

#[cfg(feature = "wmemcheck")]
wmemcheck,
#[cfg(feature = "wmemcheck")]
@@ -1030,37 +1024,6 @@ impl<'module_environment> FuncEnvironment<'module_environment> {
_ => unreachable!(),
}
}

/// Allocate the next CallIndirectSiteIndex for indirect-target
/// caching purposes, if slots remain below the slot-count limit.
fn alloc_call_indirect_index(&mut self) -> Option<CallIndirectSiteIndex> {
// We need to check to see if we have reached the cache-slot
// limit.
//
// There are two kinds of limit behavior:
//
// 1. Our function's start-index is below the limit, but we
// hit the limit in the middle of the function. We will
// allocate slots up to the limit, then stop exactly when we
// hit it.
//
// 2. Our function is beyond the limit-count of
// `call_indirect`s. The counting prescan in
// `ModuleEnvironment` that assigns start indices will
// saturate at the limit, and this function's start index
// will be exactly the limit, so we get zero slots and exit
// immediately at every call to this function.
if self.call_indirect_index >= self.tunables.max_call_indirect_cache_slots {
return None;
}

let idx = CallIndirectSiteIndex::from_u32(
u32::try_from(self.call_indirect_index)
.expect("More than 2^32 callsites; should be limited by impl limits"),
);
self.call_indirect_index += 1;
Some(idx)
}
}

struct Call<'a, 'func, 'module_env> {
@@ -1172,68 +1135,6 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> {
Ok(self.indirect_call_inst(sig_ref, func_addr, &real_call_args))
}

/// Get the address of the call-indirect cache slot for a given callsite.
pub fn call_indirect_cache_slot_addr(
&mut self,
call_site: CallIndirectSiteIndex,
vmctx: ir::Value,
) -> ir::Value {
let offset = self.env.offsets.vmctx_call_indirect_cache(call_site);
self.builder.ins().iadd_imm(vmctx, i64::from(offset))
}

/// Load the cached index and code pointer for an indirect call.
///
/// Generates IR like:
///
/// ```ignore
/// v1 = load.i64 cache_ptr+0 ;; cached index (cache key)
/// v2 = load.i64 cache_ptr+8 ;; cached raw code pointer (cache value)
/// ```
///
/// and returns `(index, code_ptr)` (e.g. from above, `(v1, v2)`).
fn load_cached_indirect_index_and_code_ptr(
&mut self,
cache_ptr: ir::Value,
) -> (ir::Value, ir::Value) {
let cached_index = self.builder.ins().load(
I32,
MemFlags::trusted(),
cache_ptr,
Offset32::from(self.env.offsets.ptr.vmcall_indirect_cache_index()),
);
let cached_code_ptr = self.builder.ins().load(
self.env.pointer_type(),
MemFlags::trusted(),
cache_ptr,
Offset32::from(self.env.offsets.ptr.vmcall_indirect_cache_wasm_call()),
);

(cached_index, cached_code_ptr)
}

/// Update the indirect-call cache: store a new index and raw code
/// pointer in the slot for a given callsite.
fn store_cached_indirect_index_and_code_ptr(
&mut self,
cache_ptr: ir::Value,
index: ir::Value,
code_ptr: ir::Value,
) {
self.builder.ins().store(
MemFlags::trusted(),
index,
cache_ptr,
Offset32::from(self.env.offsets.ptr.vmcall_indirect_cache_index()),
);
self.builder.ins().store(
MemFlags::trusted(),
code_ptr,
cache_ptr,
Offset32::from(self.env.offsets.ptr.vmcall_indirect_cache_wasm_call()),
);
}
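
(Aside: viewed as a Rust struct, the slot that the two helpers above read and
write would look roughly as follows. The actual storage was raw space inside
VMContext addressed by the computed offsets, so this struct, including the
assumed padding, is illustrative only.)

```rust
#[repr(C)]
struct CallIndirectCacheSlot {
    index: u32,           // cache key: table index last dispatched from this site
    _pad: u32,            // assumed padding before the pointer-aligned field
    wasm_call: *const u8, // cache value: raw code pointer for that table entry
}
```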

/// Do an indirect call through the given funcref table.
pub fn indirect_call(
mut self,
Expand All @@ -1243,126 +1144,14 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> {
callee: ir::Value,
call_args: &[ir::Value],
) -> WasmResult<Option<ir::Inst>> {
// If we are performing call-indirect caching with this table, check the cache.
let caching = if self.env.tunables.cache_call_indirects {
let plan = &self.env.module.table_plans[table_index];
// We can do the indirect call caching optimization only
// if table elements will not change (no opcodes exist
// that could write the table, and table not exported),
// and if we can use the zero-index as a sentinel for "no
// cache entry" (initial zeroed vmctx state).
!plan.written && !plan.non_null_zero
} else {
false
};

// Allocate a call-indirect cache slot if caching is
// enabled. Note that this still may be `None` if we run out
// of slots.
let call_site = if caching {
self.env.alloc_call_indirect_index()
} else {
None
};

let (code_ptr, callee_vmctx) = if let Some(call_site) = call_site {
// Get a local copy of `vmctx`.
let vmctx = self.env.vmctx(self.builder.func);
let vmctx = self
.builder
.ins()
.global_value(self.env.pointer_type(), vmctx);

// Get the address of the cache slot in the VMContext
// struct.
let slot = self.call_indirect_cache_slot_addr(call_site, vmctx);

// Create the following CFG and generate code with the following outline:
//
// (load cached index and code pointer)
// hit = icmp eq (cached index), (callee)
// brif hit, call_block((cached code ptr), vmctx), miss_block
//
// miss_block:
// (compute actual code pointer, with checks)
// same_instance = icmp eq (callee vmctx), (vmctx)
// brif same_instance update_block, call_block((actual code ptr), (callee vmctx))
//
// update_block:
// (store actual index and actual code pointer)
// jump call_block((actual code ptr), (callee vmctx))
//
// call_block(code_ptr, callee_vmctx):
// (unchecked call-indirect sequence)

// Create two-level conditionals with CFG.
let current_block = self.builder.current_block().unwrap();
let miss_block = self.builder.create_block();
let call_block = self.builder.create_block();
let update_block = self.builder.create_block();

self.builder.insert_block_after(miss_block, current_block);
self.builder.insert_block_after(update_block, miss_block);
self.builder.insert_block_after(call_block, update_block);
self.builder.set_cold_block(miss_block);
self.builder.set_cold_block(update_block);

// Load cached values, check for hit, branch to
// call block or miss block.

let (cached_index, cached_code_ptr) =
self.load_cached_indirect_index_and_code_ptr(slot);
let hit = self.builder.ins().icmp(IntCC::Equal, cached_index, callee);
self.builder
.ins()
.brif(hit, call_block, &[cached_code_ptr, vmctx], miss_block, &[]);

// Miss block: compute actual callee code pointer and
// vmctx, and update cache if same-instance.

self.builder.seal_block(miss_block);
self.builder.switch_to_block(miss_block);

if let Some((code_ptr, callee_vmctx)) =
self.check_and_load_code_and_callee_vmctx(table_index, ty_index, callee, true)?
{
// If callee vmctx is equal to ours, update the cache.
let same_instance = self.builder.ins().icmp(IntCC::Equal, callee_vmctx, vmctx);

self.builder.ins().brif(
same_instance,
update_block,
&[],
call_block,
&[code_ptr, callee_vmctx],
);

self.builder.seal_block(update_block);
self.builder.switch_to_block(update_block);

self.store_cached_indirect_index_and_code_ptr(slot, callee, code_ptr);
self.builder
.ins()
.jump(call_block, &[code_ptr, callee_vmctx]);
}

// Call block: do the call.

self.builder.seal_block(call_block);
self.builder.switch_to_block(call_block);

let code_ptr = self
.builder
.append_block_param(call_block, self.env.pointer_type());
let callee_vmctx = self
.builder
.append_block_param(call_block, self.env.pointer_type());
(code_ptr, callee_vmctx)
} else {
match self.check_and_load_code_and_callee_vmctx(table_index, ty_index, callee, false)? {
Some(pair) => pair,
None => return Ok(None),
}
let (code_ptr, callee_vmctx) = match self.check_and_load_code_and_callee_vmctx(
table_index,
ty_index,
callee,
false,
)? {
Some(pair) => pair,
None => return Ok(None),
};

self.unchecked_call_impl(sig_ref, code_ptr, callee_vmctx, call_args)
(The remaining 17 changed files are not shown.)