From fb26c56ae0c34c3c09cb3d40ea3484355c67bd84 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Wed, 6 Sep 2023 16:19:15 -0500
Subject: [PATCH] Optimize `list<T>` lifting and lowering (#6971)

This commit optimizes the lifting and lowering routines for `list<u8>`
and lists of other similar primitive integer types. The current
lifting/lowering is intended to be general-purpose and correct, but it
doesn't optimize well with LLVM for a number of reasons. I first
attempted to reshape the general-purpose code to be easier for LLVM to
optimize, but in the end I was unable to convince LLVM that the various
pointers involved don't alias, which meant that the general-purpose
lowering/lifting never optimized well.

Instead I've added new trait methods whose default implementations
behave the same way as the general-purpose code did beforehand. The
integer/primitive implementations then override these defaults with
more specialized versions that take advantage of knowledge about the
primitive types. On a local benchmark this makes lifting/lowering
disappear from a profile, since a memcpy is generally much faster than
per-item processing.
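For a rough picture of why the specialized path optimizes better, the
store half boils down to the pattern sketched below. Note that this is
an illustrative, self-contained sketch only: `copy_u32s_le` and the
bare `&mut [u8]` destination are hypothetical stand-ins for what the
real code threads through `LowerContext`, and it is specialized to
`u32` where the real code is generated per integer type.

    // Hypothetical sketch, not part of this patch.
    fn copy_u32s_le(dst: &mut [u8], items: &[u32]) {
        assert_eq!(dst.len(), items.len() * core::mem::size_of::<u32>());
        // Viewing `[u8]` as `[u32]` is sound for integral types since
        // every byte pattern is a valid `u32`; the asserts double-check
        // that the canonical ABI's alignment guarantees held.
        let (before, middle, end) = unsafe { dst.align_to_mut::<u32>() };
        assert!(before.is_empty() && end.is_empty());
        assert_eq!(middle.len(), items.len());
        // `to_le` is the identity function on little-endian targets, so
        // LLVM can collapse this loop into a single memcpy there.
        for (dst, src) in middle.iter_mut().zip(items) {
            *dst = src.to_le();
        }
    }

With this shape there is no per-item bounds checking or `Result`
propagation left for LLVM to reason about, which is what allows the
loop to become a memcpy.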
---
 crates/wasmtime/src/component/func/typed.rs | 103 ++++++++++++++++++--
 1 file changed, 95 insertions(+), 8 deletions(-)

diff --git a/crates/wasmtime/src/component/func/typed.rs b/crates/wasmtime/src/component/func/typed.rs
index 93b74c138934..3830b025b2a6 100644
--- a/crates/wasmtime/src/component/func/typed.rs
+++ b/crates/wasmtime/src/component/func/typed.rs
@@ -507,6 +507,31 @@ pub unsafe trait Lower: ComponentType {
         ty: InterfaceType,
         offset: usize,
     ) -> Result<()>;
+
+    /// Provided method to lower a list of `Self` into memory.
+    ///
+    /// Requires that `offset` has already been checked for alignment and
+    /// validity in terms of being in-bounds, otherwise this may panic.
+    ///
+    /// This is primarily here to get overridden for implementations of integers
+    /// which can avoid some extra fluff and use a pattern that's more easily
+    /// optimizable by LLVM.
+    #[doc(hidden)]
+    fn store_list<T>(
+        cx: &mut LowerContext<'_, T>,
+        ty: InterfaceType,
+        mut offset: usize,
+        items: &[Self],
+    ) -> Result<()>
+    where
+        Self: Sized,
+    {
+        for item in items {
+            item.store(cx, ty, offset)?;
+            offset += Self::SIZE32;
+        }
+        Ok(())
+    }
 }
 
 /// Host types which can be created from the canonical ABI.
@@ -547,6 +572,21 @@ pub unsafe trait Lift: Sized + ComponentType {
     /// for `Op::Lift` this needs to be overridden.
     #[doc(hidden)]
     fn load(cx: &mut LiftContext<'_>, ty: InterfaceType, bytes: &[u8]) -> Result<Self>;
+
+    /// Converts `list` into a `Vec<T>`, used in `Lift for Vec<T>`.
+    ///
+    /// This is primarily here to get overridden for implementations of integers
+    /// which can avoid some extra fluff and use a pattern that's more easily
+    /// optimizable by LLVM.
+    #[doc(hidden)]
+    fn load_list(cx: &mut LiftContext<'_>, list: &WasmList<Self>) -> Result<Vec<Self>>
+    where
+        Self: Sized,
+    {
+        (0..list.len)
+            .map(|index| list.get_from_store(cx, index).unwrap())
+            .collect()
+    }
 }
 
 // Macro to help generate "forwarding implementations" of `ComponentType` to
@@ -636,12 +676,12 @@ macro_rules! forward_list_lifts {
         unsafe impl<T: Lift> Lift for $a {
             fn lift(cx: &mut LiftContext<'_>, ty: InterfaceType, src: &Self::Lower) -> Result<Self> {
                 let list = <WasmList<T> as Lift>::lift(cx, ty, src)?;
-                (0..list.len).map(|index| list.get_from_store(cx, index).unwrap()).collect()
+                Ok(T::load_list(cx, &list)?.into())
             }
 
             fn load(cx: &mut LiftContext<'_>, ty: InterfaceType, bytes: &[u8]) -> Result<Self> {
                 let list = <WasmList<T> as Lift>::load(cx, ty, bytes)?;
-                (0..list.len).map(|index| list.get_from_store(cx, index).unwrap()).collect()
+                Ok(T::load_list(cx, &list)?.into())
             }
         }
     )*)
@@ -694,6 +734,43 @@ macro_rules! integers {
                 *cx.get(offset) = self.to_le_bytes();
                 Ok(())
             }
+
+            fn store_list<T>(
+                cx: &mut LowerContext<'_, T>,
+                ty: InterfaceType,
+                offset: usize,
+                items: &[Self],
+            ) -> Result<()> {
+                debug_assert!(matches!(ty, InterfaceType::$ty));
+
+                // Double-check that the CM alignment is at least the host's
+                // alignment for this type which should be true for all
+                // platforms.
+                assert!((Self::ALIGN32 as usize) >= mem::align_of::<Self>());
+
+                // Slice `cx`'s memory to the window that we'll be modifying.
+                // This should all have already been verified in terms of
+                // alignment and sizing meaning that these assertions here are
+                // not truly necessary but are instead double-checks.
+                //
+                // Note that we're casting a `[u8]` slice to `[Self]` with
+                // `align_to_mut` which is not safe in general but is safe in
+                // our specific case as all `u8` patterns are valid `Self`
+                // patterns since `Self` is an integral type.
+                let dst = &mut cx.as_slice_mut()[offset..][..items.len() * Self::SIZE32];
+                let (before, middle, end) = unsafe { dst.align_to_mut::<Self>() };
+                assert!(before.is_empty() && end.is_empty());
+                assert_eq!(middle.len(), items.len());
+
+                // And with all that out of the way perform the copying loop.
+                // This is not a `copy_from_slice` because endianness needs to
+                // be handled here, but LLVM should pretty easily transform this
+                // into a memcpy on little-endian platforms.
+                for (dst, src) in middle.iter_mut().zip(items) {
+                    *dst = src.to_le();
+                }
+                Ok(())
+            }
         }
 
         unsafe impl Lift for $primitive {
@@ -709,6 +786,15 @@ macro_rules! integers {
                 debug_assert!((bytes.as_ptr() as usize) % Self::SIZE32 == 0);
                 Ok($primitive::from_le_bytes(bytes.try_into().unwrap()))
             }
+
+            fn load_list(cx: &mut LiftContext<'_>, list: &WasmList<Self>) -> Result<Vec<Self>> {
+                Ok(
+                    list._as_le_slice(cx.memory())
+                        .iter()
+                        .map(|i| Self::from_le(*i))
+                        .collect(),
+                )
+            }
         }
     )*)
 }
@@ -1310,11 +1396,7 @@ where
         .checked_mul(elem_size)
         .ok_or_else(|| anyhow!("size overflow copying a list"))?;
     let ptr = cx.realloc(0, 0, T::ALIGN32, size)?;
-    let mut cur = ptr;
-    for item in list {
-        item.store(cx, ty, cur)?;
-        cur += elem_size;
-    }
+    T::store_list(cx, ty, ptr, list)?;
     Ok((ptr, list.len()))
 }
 
@@ -1454,9 +1536,14 @@ macro_rules! raw_wasm_list_accessors {
             /// Panics if the `store` provided is not the one from which this
             /// slice originated.
             pub fn as_le_slice<'a, T: 'a>(&self, store: impl Into<StoreContext<'a, T>>) -> &'a [$i] {
+                let memory = self.options.memory(store.into().0);
+                self._as_le_slice(memory)
+            }
+
+            fn _as_le_slice<'a>(&self, all_of_memory: &'a [u8]) -> &'a [$i] {
                 // See comments in `WasmList::get` for the panicking indexing
                 let byte_size = self.len * mem::size_of::<$i>();
-                let bytes = &self.options.memory(store.into().0)[self.ptr..][..byte_size];
+                let bytes = &all_of_memory[self.ptr..][..byte_size];
                 // The canonical ABI requires that everything is aligned to its
                 // own size, so this should be an aligned array. Furthermore the