From a0b6a2c38e42d55d80b54c2c106eddde544251d0 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Wed, 20 Sep 2023 15:57:12 -0700
Subject: [PATCH 1/3] Add memory protection keys (MPK)

In order to use MPK on an x86_64 Linux system, we need access to the
underlying `pkey_*` system calls (`sys`), control of the x86 PKRU
register (`pkru`), and a way of determining if MPK is even supported
(`is_supported`). These various parts are wrapped in a `ProtectionKey`
abstraction along with a `ProtectionMask` that can be used to `allow`
the CPU to access protected regions.
---
 crates/runtime/src/mpk/disabled.rs |  39 +++++++
 crates/runtime/src/mpk/enabled.rs  | 167 +++++++++++++++++++++++++++++
 crates/runtime/src/mpk/mod.rs      |  41 +++++++
 crates/runtime/src/mpk/pkru.rs     |  98 +++++++++++++++++
 crates/runtime/src/mpk/sys.rs      | 122 +++++++++++++++++++++
 5 files changed, 467 insertions(+)
 create mode 100644 crates/runtime/src/mpk/disabled.rs
 create mode 100644 crates/runtime/src/mpk/enabled.rs
 create mode 100644 crates/runtime/src/mpk/mod.rs
 create mode 100644 crates/runtime/src/mpk/pkru.rs
 create mode 100644 crates/runtime/src/mpk/sys.rs

diff --git a/crates/runtime/src/mpk/disabled.rs b/crates/runtime/src/mpk/disabled.rs
new file mode 100644
index 000000000000..777fe3278e5d
--- /dev/null
+++ b/crates/runtime/src/mpk/disabled.rs
@@ -0,0 +1,39 @@
+//! Noop implementations of MPK primitives for environments that do not support
+//! the feature.
+
+#![allow(missing_docs)]
+
+use anyhow::Result;
+
+pub fn is_supported() -> bool {
+    false
+}
+pub fn keys() -> &'static [ProtectionKey] {
+    &[]
+}
+pub fn allow(_: ProtectionMask) {}
+
+#[derive(Clone, Copy, Debug)]
+pub struct ProtectionKey;
+impl ProtectionKey {
+    pub fn protect(&self, _: &mut [u8]) -> Result<()> {
+        Ok(())
+    }
+    pub fn as_stripe(&self) -> usize {
+        0
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub struct ProtectionMask;
+impl ProtectionMask {
+    pub fn all() -> Self {
+        Self
+    }
+    pub fn zero() -> Self {
+        Self
+    }
+    pub fn or(self, _: ProtectionKey) -> Self {
+        Self
+    }
+}
diff --git a/crates/runtime/src/mpk/enabled.rs b/crates/runtime/src/mpk/enabled.rs
new file mode 100644
index 000000000000..087a685c3f25
--- /dev/null
+++ b/crates/runtime/src/mpk/enabled.rs
@@ -0,0 +1,167 @@
+//! Implementations of the MPK primitives for supported systems (x86_64 Linux).
+
+use super::{pkru, sys};
+use anyhow::{Context, Result};
+use std::sync::OnceLock;
+
+/// Check if the MPK feature is supported.
+pub fn is_supported() -> bool {
+    cfg!(target_os = "linux") && cfg!(target_arch = "x86_64") && pkru::has_cpuid_bit_set()
+    // TODO: we cannot check CR4 due to privilege
+}
+
+/// Allocate all protection keys available to this process.
+///
+/// This asks the kernel for all available keys (we expect 1-15; 0 is
+/// kernel-reserved) in a thread-safe way. This avoids interference when
+/// multiple threads try to allocate keys at the same time (e.g., during
+/// testing). It also ensures that a single copy of the keys is reserved for
+/// the lifetime of the process.
+///
+/// TODO: this is not the best-possible design. This creates global state that
+/// would prevent any other code in the process from using protection keys; the
+/// `KEYS` are never deallocated from the system with `pkey_free`.
+pub fn keys() -> &'static [ProtectionKey] {
+    let keys = KEYS.get_or_init(|| {
+        let mut allocated = vec![];
+        if is_supported() {
+            while let Ok(key_id) = sys::pkey_alloc(0, 0) {
+                debug_assert!(key_id < 16);
+                // UNSAFETY: here we unsafely assume that the system-allocated pkey
+                // will exist forever.
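+                // (This assumption is sound so long as nothing ever calls
+                // `sys::pkey_free` on these keys; see the TODO above.)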
+                let pkey = ProtectionKey(key_id);
+                debug_assert_eq!(pkey.as_stripe(), allocated.len());
+                allocated.push(pkey);
+            }
+        }
+        allocated
+    });
+    &keys
+}
+static KEYS: OnceLock<Vec<ProtectionKey>> = OnceLock::new();
+
+/// Only allow access to pages marked by the keys set in `mask`.
+///
+/// Any accesses to pages marked by another key will result in a `SIGSEGV`
+/// fault.
+pub fn allow(mask: ProtectionMask) {
+    let mut allowed = 0;
+    for i in 0..16 {
+        if mask.0 & (1 << i) != 0 {
+            allowed |= 0b11 << (i * 2);
+        }
+    }
+
+    let previous = pkru::read();
+    pkru::write(pkru::DISABLE_ACCESS ^ allowed);
+    log::debug!("PKRU change: {:#034b} => {:#034b}", previous, pkru::read());
+}
+
+/// An MPK protection key.
+///
+/// The expected usage is:
+/// - allocate a new key by requesting one from the kernel (see [`keys`])
+/// - mark some regions of memory as accessible with [`ProtectionKey::protect`]
+/// - [`allow`] or disallow access to the memory regions using a
+///   [`ProtectionMask`]; any accesses to unmarked pages result in a fault
+/// - drop the key
+///
+/// Since each key is allocated from the kernel, we must inform the kernel
+/// when it is dropped. Similarly, to retrieve all available protection keys,
+/// one must request them from the kernel (e.g., call [`sys::pkey_alloc`] until
+/// it fails).
+///
+/// Because MPK may not be available on all systems, the `disabled` module
+/// provides a no-op version of this type; the idea is that the API can remain
+/// the same regardless of MPK support.
+#[derive(Clone, Copy, Debug)]
+pub struct ProtectionKey(u32);
+
+impl ProtectionKey {
+    /// Mark a page as protected by this [`ProtectionKey`].
+    ///
+    /// This "colors" the pages of `region` via a kernel `pkey_mprotect` call to
+    /// only allow reads and writes when this [`ProtectionKey`] is activated
+    /// (see [`allow`]).
+    ///
+    /// # Errors
+    ///
+    /// This will fail if the region is not page-aligned or for some unknown
+    /// kernel reason.
+    pub fn protect(&self, region: &mut [u8]) -> Result<()> {
+        let addr = region.as_mut_ptr() as usize;
+        let len = region.len();
+        let prot = sys::PROT_READ | sys::PROT_WRITE;
+        sys::pkey_mprotect(addr, len, prot, self.0).with_context(|| {
+            format!(
+                "failed to mark region with pkey (addr = {addr:#x}, len = {len}, prot = {prot:#b})"
+            )
+        })
+    }
+
+    /// Convert the [`ProtectionKey`] to its 0-based index; this is useful for
+    /// determining which allocation "stripe" a key belongs to.
+    ///
+    /// This function assumes that the kernel has allocated key 0 for itself.
+    pub fn as_stripe(&self) -> usize {
+        debug_assert!(self.0 != 0);
+        self.0 as usize - 1
+    }
+}
+
+/// A bit field indicating which protection keys should be *allowed*.
+///
+/// When bit `n` is set, it means the protection key is allowed--conversely,
+/// protection is disabled for this key.
+pub struct ProtectionMask(u16);
+impl ProtectionMask {
+    /// Allow access from all protection keys.
+    pub fn all() -> Self {
+        Self(u16::MAX)
+    }
+
+    /// Only allow access to memory protected with protection key 0; note that
+    /// this does not mean "none" but rather allows access from the default
+    /// kernel protection key.
+    pub fn zero() -> Self {
+        Self(1)
+    }
+
+    /// Include `pkey` as another allowed protection key in the mask.
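+    ///
+    /// For example, `ProtectionMask::zero().or(pkey)` allows access to pages
+    /// marked with `pkey` in addition to those marked with the default kernel
+    /// key 0.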
+    pub fn or(self, pkey: ProtectionKey) -> Self {
+        Self(self.0 | 1 << pkey.0)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn check_is_supported() {
+        println!("is pku supported = {}", is_supported());
+    }
+
+    #[test]
+    fn check_initialized_keys() {
+        if is_supported() {
+            assert!(!keys().is_empty())
+        }
+    }
+
+    #[test]
+    fn check_invalid_mark() {
+        if !is_supported() {
+            return;
+        }
+        let pkey = keys()[0];
+        let unaligned_region = unsafe {
+            let addr = 1 as *mut u8; // this is not page-aligned!
+            let len = 1;
+            std::slice::from_raw_parts_mut(addr, len)
+        };
+        let result = pkey.protect(unaligned_region);
+        assert!(result.is_err());
+        assert_eq!(
+            result.unwrap_err().to_string(),
+            "failed to mark region with pkey (addr = 0x1, len = 1, prot = 0b11)"
+        );
+    }
+}
diff --git a/crates/runtime/src/mpk/mod.rs b/crates/runtime/src/mpk/mod.rs
new file mode 100644
index 000000000000..81156183cf2a
--- /dev/null
+++ b/crates/runtime/src/mpk/mod.rs
@@ -0,0 +1,41 @@
+//! Memory Protection Keys (MPK) implementation for use in striped memory
+//! allocation.
+//!
+//! MPK is an x86 feature available on relatively recent versions of Intel and
+//! AMD CPUs. In Linux, this feature is named `pku` (protection keys userspace)
+//! and consists of three new system calls: `pkey_alloc`, `pkey_free`, and
+//! `pkey_mprotect` (see the [Linux documentation]). This module provides an
+//! abstraction, [`ProtectionKey`], that the [pooling allocator] applies to
+//! contiguous memory allocations, allowing it to avoid guard pages in some
+//! cases and use memory more efficiently. This technique was first presented
+//! in a 2022 paper: [Segue and ColorGuard: Optimizing SFI Performance and
+//! Scalability on Modern x86][colorguard].
+//!
+//! [pooling allocator]: crate::PoolingInstanceAllocator
+//! [Linux documentation]:
+//!     https://www.kernel.org/doc/html/latest/core-api/protection-keys.html
+//! [colorguard]: https://plas2022.github.io/files/pdf/SegueColorGuard.pdf
+//!
+//! On x86_64 Linux systems, this module implements the various parts necessary
+//! to use MPK in Wasmtime:
+//! - [`is_supported`] indicates whether the feature is available at runtime
+//! - [`ProtectionKey`] provides safe access to the kernel-allocated protection
+//!   keys
+//! - the `sys` module bridges the gap to Linux's `pkey_*` system calls
+//! - the `pkru` module controls the x86 `PKRU` register (and other CPU state)
+//!
+//! On any other kind of machine, this module exposes noop implementations of
+//! the public interface.
+
+#[cfg(all(target_arch = "x86_64", target_os = "linux"))]
+mod enabled;
+#[cfg(all(target_arch = "x86_64", target_os = "linux"))]
+mod pkru;
+#[cfg(all(target_arch = "x86_64", target_os = "linux"))]
+mod sys;
+#[cfg(all(target_arch = "x86_64", target_os = "linux"))]
+pub use enabled::{allow, is_supported, keys, ProtectionKey, ProtectionMask};
+
+#[cfg(not(all(target_arch = "x86_64", target_os = "linux")))]
+mod disabled;
+#[cfg(not(all(target_arch = "x86_64", target_os = "linux")))]
+pub use disabled::{allow, is_supported, keys, ProtectionKey, ProtectionMask};
diff --git a/crates/runtime/src/mpk/pkru.rs b/crates/runtime/src/mpk/pkru.rs
new file mode 100644
index 000000000000..a7e3bcde185f
--- /dev/null
+++ b/crates/runtime/src/mpk/pkru.rs
@@ -0,0 +1,98 @@
+//! Control access to the x86 `PKRU` register.
+//!
+//! As documented in the Intel Software Development Manual, vol 3a, section 2.7,
+//! the 32 bits of the `PKRU` register are laid out as follows (key 0 occupies
+//! the least-significant bits):
+//!
+//! ```text
+//! ┌───┬───┬───┬───┬───┬───┐
+//! 
│...│AD2│WD1│AD1│WD0│AD0│
+//! └───┴───┴───┴───┴───┴───┘
+//! ```
+//!
+//! - `ADn = 1` means "access disable key `n`"--no reads or writes allowed to
+//!   pages marked with key `n`.
+//! - `WDn = 1` means "write disable key `n`"--writes (but not reads) are
+//!   prevented to pages marked with key `n`
+//! - setting both `ADn` and `WDn` is equivalent to `ADn` alone, since
+//!   access-disable already prevents both reads and writes
+//!
+//! Note that this only handles the user-mode `PKRU` register; there is an
+//! equivalent supervisor-mode MSR, `IA32_PKRS`.
+
+use core::arch::asm;
+
+/// This `PKRU` register mask allows access to any pages marked with any
+/// key--in other words, reading and writing is permitted to all pages.
+#[cfg(test)]
+const ALLOW_ACCESS: u32 = 0;
+
+/// This `PKRU` register mask disables access to any page marked with any
+/// key--in other words, no reading or writing to all pages.
+pub const DISABLE_ACCESS: u32 = 0b11111111_11111111_11111111_11111111;
+
+/// Read the value of the `PKRU` register.
+pub fn read() -> u32 {
+    // ECX must be 0 to prevent a general protection exception (#GP).
+    let ecx: u32 = 0;
+    let pkru: u32;
+    unsafe {
+        asm!("rdpkru", in("ecx") ecx, out("eax") pkru, out("edx") _,
+            options(nomem, nostack, preserves_flags));
+    }
+    pkru
+}
+
+/// Write a value to the `PKRU` register.
+pub fn write(pkru: u32) {
+    // Both ECX and EDX must be 0 to prevent a general protection exception
+    // (#GP).
+    let ecx: u32 = 0;
+    let edx: u32 = 0;
+    unsafe {
+        asm!("wrpkru", in("eax") pkru, in("ecx") ecx, in("edx") edx,
+            options(nomem, nostack, preserves_flags));
+    }
+}
+
+/// Check the `ECX.PKU` flag (bit 3) of the `07h` `CPUID` leaf; see the
+/// Intel Software Development Manual, vol 3a, section 2.7.
+pub fn has_cpuid_bit_set() -> bool {
+    let result = unsafe { std::arch::x86_64::__cpuid(0x07) };
+    (result.ecx & 0b1000) != 0
+}
+
+/// Check that the `CR4.PKE` flag (bit 22) is set; see the Intel Software
+/// Development Manual, vol 3a, section 2.7. This register can only be
+/// accessed from privilege level 0.
+#[cfg(test)]
+fn has_cr4_bit_set() -> bool {
+    let cr4: u64;
+    unsafe {
+        asm!("mov {}, cr4", out(reg) cr4, options(nomem, nostack, preserves_flags));
+    }
+    (cr4 & (1 << 22)) != 0
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    #[ignore = "cannot be run with other tests that munge the PKRU register"]
+    fn check_read() {
+        // By default, the Linux kernel only allows a process to access key 0,
+        // the default kernel key.
+        assert_eq!(read(), DISABLE_ACCESS ^ 1);
+    }
+
+    #[test]
+    fn check_roundtrip() {
+        let pkru = read();
+        // Allow access to pages marked with any key.
+        write(0);
+        assert_eq!(read(), ALLOW_ACCESS);
+        // Restore the original value.
+        write(pkru);
+        assert_eq!(read(), pkru);
+    }
+}
diff --git a/crates/runtime/src/mpk/sys.rs b/crates/runtime/src/mpk/sys.rs
new file mode 100644
index 000000000000..5d357314a7e2
--- /dev/null
+++ b/crates/runtime/src/mpk/sys.rs
@@ -0,0 +1,122 @@
+//! Expose the `pkey_*` Linux system calls. See the kernel documentation for
+//! more information:
+//! - [`pkeys`] overview
+//! - [`pkey_alloc`] (with `pkey_free`)
+//! - [`pkey_mprotect`]
+//! - `pkey_set` has no dedicated system call; we write the `PKRU` register
+//!   directly in assembly instead (see the `pkru` module)
+//!
+//! [`pkey_alloc`]: https://man7.org/linux/man-pages/man2/pkey_alloc.2.html
+//! [`pkey_mprotect`]: https://man7.org/linux/man-pages/man2/pkey_mprotect.2.html
+//! 
[`pkeys`]: https://man7.org/linux/man-pages/man7/pkeys.7.html
+
+use crate::page_size;
+use anyhow::{anyhow, Result};
+
+/// Protection mask allowing reads of pkey-protected memory (see `prot` in
+/// [`pkey_mprotect`]).
+pub const PROT_READ: u32 = libc::PROT_READ as u32; // == 0b0001.
+
+/// Protection mask allowing writes of pkey-protected memory (see `prot` in
+/// [`pkey_mprotect`]).
+pub const PROT_WRITE: u32 = libc::PROT_WRITE as u32; // == 0b0010.
+
+/// Allocate a new protection key in the Linux kernel ([docs]); returns the
+/// key ID.
+///
+/// [docs]: https://man7.org/linux/man-pages/man2/pkey_alloc.2.html
+///
+/// Each process has its own separate pkey index; e.g., if process `m`
+/// allocates key 1, process `n` can as well.
+pub fn pkey_alloc(flags: u32, access_rights: u32) -> Result<u32> {
+    debug_assert_eq!(flags, 0); // reserved for future use--must be 0.
+    let result = unsafe { libc::syscall(libc::SYS_pkey_alloc, flags, access_rights) };
+    if result >= 0 {
+        Ok(result
+            .try_into()
+            .expect("only pkey IDs between 0 and 15 are expected"))
+    } else {
+        debug_assert_eq!(result, -1); // only this error result is expected.
+        Err(anyhow!(unsafe { errno_as_string() }))
+    }
+}
+
+/// Free a kernel protection key ([docs]).
+///
+/// [docs]: https://man7.org/linux/man-pages/man2/pkey_alloc.2.html
+#[allow(dead_code)]
+pub fn pkey_free(key: u32) -> Result<()> {
+    let result = unsafe { libc::syscall(libc::SYS_pkey_free, key) };
+    if result == 0 {
+        Ok(())
+    } else {
+        debug_assert_eq!(result, -1); // only this error result is expected.
+        Err(anyhow!(unsafe { errno_as_string() }))
+    }
+}
+
+/// Change the access protections for a page-aligned memory region ([docs]).
+///
+/// [docs]: https://man7.org/linux/man-pages/man2/pkey_mprotect.2.html
+pub fn pkey_mprotect(addr: usize, len: usize, prot: u32, key: u32) -> Result<()> {
+    let page_size = page_size();
+    if addr % page_size != 0 {
+        log::warn!(
+            "memory must be page-aligned for MPK (addr = {addr:#x}, page size = {page_size})"
+        );
+    }
+    let result = unsafe { libc::syscall(libc::SYS_pkey_mprotect, addr, len, prot, key) };
+    if result == 0 {
+        Ok(())
+    } else {
+        debug_assert_eq!(result, -1); // only this error result is expected.
+        Err(anyhow!(unsafe { errno_as_string() }))
+    }
+}
+
+/// Helper function for retrieving the libc error message for the current
+/// error (see GNU libc's ["Checking for Errors"] documentation).
+///
+/// ["Checking for Errors"]: https://www.gnu.org/software/libc/manual/html_node/Checking-for-Errors.html
+unsafe fn errno_as_string() -> String {
+    let errno = *libc::__errno_location();
+    let err_ptr = libc::strerror(errno);
+    std::ffi::CStr::from_ptr(err_ptr)
+        .to_string_lossy()
+        .into_owned()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[ignore = "cannot be run when keys() has already allocated all keys"]
+    #[test]
+    fn check_allocate_and_free() {
+        let key = pkey_alloc(0, 0).unwrap();
+        assert_eq!(key, 1);
+        // It may seem strange to assert the key ID here, but we already
+        // make some assumptions:
+        //  1. we are running on Linux with `pku` enabled
+        //  2. Linux will allocate key 0 for itself
+        //  3. we are running this test in non-MPK mode and no one else is
+        //     using pkeys
+        // If these assumptions are incorrect, this test can be removed.
+        pkey_free(key).unwrap()
+    }
+
+    #[test]
+    fn check_invalid_free() {
+        let result = pkey_free(42);
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err().to_string(), "Invalid argument");
+    }
+
+    #[test]
+    #[should_panic]
+    fn check_invalid_alloc_flags() {
+        pkey_alloc(42, 0).unwrap();
+    }
+
+    #[test]
+    fn check_invalid_alloc_rights() {
+        assert!(pkey_alloc(0, 42).is_err());
+    }
+}

From 79240f89ecf3544e1e4747ef60464a4478246b2b Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Wed, 20 Sep 2023 16:02:59 -0700
Subject: [PATCH 2/3] Integrate MPK into the pooling allocator

This change adds "stripes" to the pooling allocator's `MemoryPool`. Now,
when requesting a slot in which to instantiate, the user (i.e., an
`InstanceAllocationRequest`) will be transparently assigned to one of
the stripes, each of which is associated with a protection key. The user
can also request a specific protection key to use, which will override
the original "find me a slot" logic.

This has implications for how instances get allocated: once a store is
assigned a protection key, it will only allocate into slots striped with
that key, limiting how many slots it has access to. E.g., if 15 keys are
active, the store can only ever access 1/15th of the slots.

This change also includes a tri-state configuration field,
`memory_protection_keys`, which is disabled by default for the time
being.
---
 Cargo.lock                                    |  24 +-
 crates/runtime/Cargo.toml                     |   6 +-
 .../allocator/pooling/memory_pool.txt         |   8 +
 crates/runtime/src/instance/allocator.rs      |  27 +-
 .../src/instance/allocator/on_demand.rs       |  16 +
 .../runtime/src/instance/allocator/pooling.rs | 100 ++-
 .../allocator/pooling/index_allocator.rs      |  30 +-
 .../instance/allocator/pooling/memory_pool.rs | 587 +++++++++++++++---
 crates/wasmtime/src/config.rs                 |  36 +-
 crates/wasmtime/src/instance.rs               |   1 +
 crates/wasmtime/src/store.rs                  |  30 +-
 crates/wasmtime/src/trampoline.rs             |   1 +
 crates/wasmtime/src/trampoline/memory.rs      |  14 +
 tests/all/pooling_allocator.rs                |   2 +-
 14 files changed, 768 insertions(+), 114 deletions(-)
 create mode 100644 crates/runtime/proptest-regressions/instance/allocator/pooling/memory_pool.txt

diff --git a/Cargo.lock b/Cargo.lock
index 9887f11ac7e3..4fad9916057b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1470,7 +1470,7 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f"
 dependencies = [
- "quick-error 1.2.3",
+ "quick-error",
 ]
 
 [[package]]
@@ -1843,6 +1843,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
 dependencies = [
  "autocfg",
+ "libm",
 ]
 
 [[package]]
@@ -2036,22 +2037,22 @@ dependencies = [
 
 [[package]]
 name = "proptest"
-version = "1.0.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e0d9cc07f18492d879586c92b485def06bc850da3118075cd45d50e9c95b0e5"
+checksum = "4e35c06b98bf36aba164cc17cb25f7e232f5c4aeea73baa14b8a9f0d92dbfa65"
 dependencies = [
  "bit-set",
  "bitflags 1.3.2",
  "byteorder",
  "lazy_static",
  "num-traits",
- "quick-error 2.0.1",
 "rand",
 "rand_chacha",
 "rand_xorshift",
 "regex-syntax 0.6.25",
 "rusty-fork",
 "tempfile",
+ "unarray",
 ]
 
 [[package]]
@@ -2080,12 +2081,6 @@ version = "1.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
 
-[[package]]
-name = "quick-error"
-version = "2.0.1"
-source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" - [[package]] name = "quote" version = "1.0.29" @@ -2349,7 +2344,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" dependencies = [ "fnv", - "quick-error 1.2.3", + "quick-error", "tempfile", "wait-timeout", ] @@ -2859,6 +2854,12 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicase" version = "2.6.0" @@ -3681,6 +3682,7 @@ dependencies = [ "memoffset", "once_cell", "paste", + "proptest", "rand", "rustix 0.38.8", "sptr", diff --git a/crates/runtime/Cargo.toml b/crates/runtime/Cargo.toml index 50002b88dead..63fb6bc23bef 100644 --- a/crates/runtime/Cargo.toml +++ b/crates/runtime/Cargo.toml @@ -50,6 +50,7 @@ features = [ [dev-dependencies] once_cell = { workspace = true } +proptest = "1.2.0" [build-dependencies] cc = "1.0" @@ -61,9 +62,6 @@ async = ["wasmtime-fiber"] # Enables support for the pooling instance allocator pooling-allocator = [] -component-model = [ - "wasmtime-environ/component-model", - "dep:encoding_rs", -] +component-model = ["wasmtime-environ/component-model", "dep:encoding_rs"] wmemcheck = [] diff --git a/crates/runtime/proptest-regressions/instance/allocator/pooling/memory_pool.txt b/crates/runtime/proptest-regressions/instance/allocator/pooling/memory_pool.txt new file mode 100644 index 000000000000..f95bfcd3875f --- /dev/null +++ b/crates/runtime/proptest-regressions/instance/allocator/pooling/memory_pool.txt @@ -0,0 +1,8 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc 696808084287d5d58b85c60c4720227ab4dd83ada7be6841a67162023aaf4914 # shrinks to c = SlabConstraints { max_memory_bytes: 0, num_memory_slots: 1, num_pkeys_available: 0, guard_bytes: 9223372036854775808 }
+cc cf9f6c36659f7f56ed8ea646e8c699cbf46708cef6911cdd376418ad69ea1388 # shrinks to c = SlabConstraints { max_memory_bytes: 14161452635954640438, num_memory_slots: 0, num_pkeys_available: 0, guard_bytes: 4285291437754911178 }
diff --git a/crates/runtime/src/instance/allocator.rs b/crates/runtime/src/instance/allocator.rs
index 35c1201495d6..a8cb2b5d0107 100644
--- a/crates/runtime/src/instance/allocator.rs
+++ b/crates/runtime/src/instance/allocator.rs
@@ -1,6 +1,7 @@
 use crate::imports::Imports;
 use crate::instance::{Instance, InstanceHandle};
 use crate::memory::Memory;
+use crate::mpk::ProtectionKey;
 use crate::table::Table;
 use crate::{CompiledModuleId, ModuleRuntimeInfo, Store};
 use anyhow::{anyhow, bail, Result};
@@ -23,7 +24,9 @@ pub use self::on_demand::OnDemandInstanceAllocator;
 #[cfg(feature = "pooling-allocator")]
 mod pooling;
 #[cfg(feature = "pooling-allocator")]
-pub use self::pooling::{InstanceLimits, PoolingInstanceAllocator, PoolingInstanceAllocatorConfig};
+pub use self::pooling::{
+    AutoEnabled, InstanceLimits, PoolingInstanceAllocator, PoolingInstanceAllocatorConfig,
+};
 
 /// Represents a request for a new runtime instance.
 pub struct InstanceAllocationRequest<'a> {
@@ -59,6 +62,10 @@ pub struct InstanceAllocationRequest<'a> {
 
     /// Indicates '--wmemcheck' flag.
     pub wmemcheck: bool,
+
+    /// Request that the instance's memories be protected by a specific
+    /// protection key.
+    pub pkey: Option<ProtectionKey>,
 }
 
 /// A pointer to a Store. This Option<*mut dyn Store> is wrapped in a struct
@@ -267,6 +274,24 @@ pub unsafe trait InstanceAllocatorImpl {
     /// Primarily present for the pooling allocator to remove mappings of
     /// this module from slots in linear memory.
     fn purge_module(&self, module: CompiledModuleId);
+
+    /// Use the next available protection key.
+    ///
+    /// The pooling allocator can use memory protection keys (MPK) to compress
+    /// the guard regions protecting against OOB access. Each pool-allocated
+    /// store needs its own key.
+    fn next_available_pkey(&self) -> Option<ProtectionKey>;
+
+    /// Restrict access to memory regions protected by `pkey`.
+    ///
+    /// This is useful for the pooling allocator, which can use memory
+    /// protection keys (MPK). Note: this may still allow access to other
+    /// protection keys, such as the default kernel key; see implementations of
+    /// this.
+    fn restrict_to_pkey(&self, pkey: ProtectionKey);
+
+    /// Allow access to memory regions protected by any protection key.
+    fn allow_all_pkeys(&self);
 }
 
 /// A thing that can allocate instances.
diff --git a/crates/runtime/src/instance/allocator/on_demand.rs b/crates/runtime/src/instance/allocator/on_demand.rs
index ad4d951bf57a..3874affb7ccc 100644
--- a/crates/runtime/src/instance/allocator/on_demand.rs
+++ b/crates/runtime/src/instance/allocator/on_demand.rs
@@ -3,6 +3,7 @@ use super::{
 };
 use crate::instance::RuntimeMemoryCreator;
 use crate::memory::{DefaultMemoryCreator, Memory};
+use crate::mpk::ProtectionKey;
 use crate::table::Table;
 use crate::CompiledModuleId;
 use anyhow::Result;
@@ -151,4 +152,19 @@ unsafe impl InstanceAllocatorImpl for OnDemandInstanceAllocator {
     }
 
     fn purge_module(&self, _: CompiledModuleId) {}
+
+    fn next_available_pkey(&self) -> Option<ProtectionKey> {
+        // The on-demand allocator cannot use protection keys: striping
+        // requires back-to-back allocation of memory slots, which this
+        // allocator cannot guarantee.
+        None
+    }
+
+    fn restrict_to_pkey(&self, _: ProtectionKey) {
+        // The on-demand allocator cannot use protection keys.
+    }
+
+    fn allow_all_pkeys(&self) {
+        // The on-demand allocator cannot use protection keys.
+    }
 }
diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs
index 214e66c8c5f5..202fc1850408 100644
--- a/crates/runtime/src/instance/allocator/pooling.rs
+++ b/crates/runtime/src/instance/allocator/pooling.rs
@@ -1,11 +1,62 @@
 //! Implements the pooling instance allocator.
 //!
-//! The pooling instance allocator maps memory in advance
-//! and allocates instances, memories, tables, and stacks from
-//! a pool of available resources.
+//! The pooling instance allocator maps memory in advance and allocates
+//! instances, memories, tables, and stacks from a pool of available resources.
+//! Using the pooling instance allocator can speed up module instantiation when
+//! modules can be constrained based on configurable limits
+//! ([`InstanceLimits`]). Each new instance is stored in a "slot"; as instances
+//! are allocated and freed, these slots are either filled or emptied:
 //!
-//! Using the pooling instance allocator can speed up module instantiation
-//! when modules can be constrained based on configurable limits.
+//! ```text
+//! ┌──────┬──────┬──────┬──────┬──────┐
+//! │Slot 0│Slot 1│Slot 2│Slot 3│......│
+//! └──────┴──────┴──────┴──────┴──────┘
+//! ```
+//!
+//! Note that these slots are a useful abstraction but do not exactly match how
+//! the allocator lays the parts out in memory. Each new instance _does_ get
+//! associated with a slot number (see uses of `index` and [`SlotId`] in this
+//! module) but the parts of the instances are stored in separate pools:
+//! memories in the [`MemoryPool`], tables in the [`TablePool`], etc. What ties
+//! these various parts together is the slot number generated by an
+//! [`IndexAllocator`].
+//!
+//! The [`MemoryPool`] protects Wasmtime from out-of-bounds memory accesses by
+//! inserting inaccessible guard regions between memory slots. The
+//! [`MemoryPool`] documentation has a more in-depth chart but one can think of
+//! memories being laid out like the following:
+//!
+//! ```text
+//! ┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐
+//! │Guard│Mem 0│Guard│Mem 1│Guard│Mem 2│.....│Guard│
+//! └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
+//! ```
+//!
+//! To complicate matters, each instance can have multiple memories, multiple
+//! tables, etc. You might think these would be stored consecutively in their
+//! respective pools (for instance `n`, table 0 is at table pool slot `n + 0`
+//! and table 1 is at `n + 1`), but for memories this is not the case. With
+//! protection keys enabled, memories do not need interleaved guard regions
+//! because the protection key will signal a fault if the wrong memory is
+//! accessed. Instead, the pooling allocator "stripes" the memories with
+//! different protection keys.
+//!
+//! This concept, dubbed [ColorGuard] in the original paper, relies on careful
+//! calculation of the memory sizes to prevent any "overlapping access": there
+//! are limited protection keys available (15) so the next memory using the
+//! same key must be at least as far away as the guard region we would insert
+//! otherwise. This ends up looking like the following, where a store for
+//! instance 0 (`I0`) "stripes" two memories (`M1` and `M2`) with the same
+//! protection key 1, far enough apart to signal an OOB access:
+//!
+//! ```text
+//! 
┌─────┬─────┬─────┬─────┬────────────────┬─────┬─────┬─────┐
+//! │.....│I0:M1│.....│.....│................│I0:M2│.....│.....│
+//! ├─────┼─────┼─────┼─────┼────────────────┼─────┼─────┼─────┤
+//! │.....│key 1│key 2│key 3│................│key 1│key 2│.....│
+//! └─────┴─────┴─────┴─────┴────────────────┴─────┴─────┴─────┘
+//! ```
+//!
+//! [ColorGuard]: https://plas2022.github.io/files/pdf/SegueColorGuard.pdf
 
 mod index_allocator;
 mod memory_pool;
@@ -27,7 +78,11 @@ cfg_if::cfg_if! {
 use super::{
     InstanceAllocationRequest, InstanceAllocatorImpl, MemoryAllocationIndex, TableAllocationIndex,
 };
-use crate::{instance::Instance, CompiledModuleId, Memory, Table};
+use crate::{
+    instance::Instance,
+    mpk::{self, ProtectionKey, ProtectionMask},
+    CompiledModuleId, Memory, Table,
+};
 use anyhow::{bail, Result};
 use memory_pool::MemoryPool;
 use std::{
@@ -162,6 +217,8 @@ pub struct PoolingInstanceAllocatorConfig {
     pub linear_memory_keep_resident: usize,
     /// Same as `linear_memory_keep_resident` but for tables.
     pub table_keep_resident: usize,
+    /// Whether to enable memory protection keys.
+    pub memory_protection_keys: AutoEnabled,
 }
 
 impl Default for PoolingInstanceAllocatorConfig {
@@ -174,15 +231,30 @@ impl Default for PoolingInstanceAllocatorConfig {
             async_stack_keep_resident: 0,
             linear_memory_keep_resident: 0,
             table_keep_resident: 0,
+            memory_protection_keys: AutoEnabled::Disable,
         }
     }
 }
 
+/// Describe the tri-state configuration of memory protection keys (MPK).
+#[derive(Clone, Copy, Debug)]
+pub enum AutoEnabled {
+    /// Use MPK if supported by the current system; fall back to guard regions
+    /// otherwise.
+    Auto,
+    /// Use MPK or fail if not supported.
+    Enable,
+    /// Do not use MPK.
+    Disable,
+}
+
 /// Implements the pooling instance allocator.
 ///
-/// This allocator internally maintains pools of instances, memories, tables, and stacks.
+/// This allocator internally maintains pools of instances, memories, tables,
+/// and stacks.
 ///
-/// Note: the resource pools are manually dropped so that the fault handler terminates correctly.
+/// Note: the resource pools are manually dropped so that the fault handler
+/// terminates correctly.
 #[derive(Debug)]
 pub struct PoolingInstanceAllocator {
     limits: InstanceLimits,
@@ -533,6 +605,18 @@ unsafe impl InstanceAllocatorImpl for PoolingInstanceAllocator {
     fn purge_module(&self, module: CompiledModuleId) {
         self.memories.purge_module(module);
     }
+
+    fn next_available_pkey(&self) -> Option<ProtectionKey> {
+        self.memories.next_available_pkey()
+    }
+
+    fn restrict_to_pkey(&self, pkey: ProtectionKey) {
+        mpk::allow(ProtectionMask::zero().or(pkey));
+    }
+
+    fn allow_all_pkeys(&self) {
+        mpk::allow(ProtectionMask::all());
+    }
 }
 
 #[cfg(test)]
diff --git a/crates/runtime/src/instance/allocator/pooling/index_allocator.rs b/crates/runtime/src/instance/allocator/pooling/index_allocator.rs
index d4079680be89..7d35f8e3f757 100644
--- a/crates/runtime/src/instance/allocator/pooling/index_allocator.rs
+++ b/crates/runtime/src/instance/allocator/pooling/index_allocator.rs
@@ -167,6 +167,12 @@ impl ModuleAffinityIndexAllocator {
         }))
     }
 
+    /// How many slots can this allocator allocate?
+    pub fn len(&self) -> usize {
+        let inner = self.0.lock().unwrap();
+        inner.slot_state.len()
+    }
+
     /// Are zero slots in use right now?
     pub fn is_empty(&self) -> bool {
         let inner = self.0.lock().unwrap();
@@ -299,8 +305,16 @@ impl ModuleAffinityIndexAllocator {
         });
     }
 
-    /// For testing only, we want to be able to assert what is on the
-    /// single freelist, for the policies that keep just one.
+    /// Return the number of empty slots available in this allocator.
+    #[cfg(test)]
+    pub fn num_empty_slots(&self) -> usize {
+        let inner = self.0.lock().unwrap();
+        let total_slots = inner.slot_state.len();
+        (total_slots - inner.last_cold as usize) + inner.unused_warm_slots as usize
+    }
+
+    /// For testing only, we want to be able to assert what is on the single
+    /// freelist, for the policies that keep just one.
     #[cfg(test)]
     #[allow(unused)]
     pub(crate) fn testing_freelist(&self) -> Vec<SlotId> {
@@ -311,8 +325,8 @@ impl ModuleAffinityIndexAllocator {
             .collect()
     }
 
-    /// For testing only, get the list of all modules with at least
-    /// one slot with affinity for that module.
+    /// For testing only, get the list of all modules with at least one slot
+    /// with affinity for that module.
     #[cfg(test)]
     pub(crate) fn testing_module_affinity_list(&self) -> Vec<CompiledModuleId> {
         let inner = self.0.lock().unwrap();
@@ -475,7 +489,9 @@ mod test {
     fn test_next_available_allocation_strategy() {
         for size in 0..20 {
             let state = ModuleAffinityIndexAllocator::new(size, 0);
+            assert_eq!(state.num_empty_slots() as u32, size);
             for i in 0..size {
+                assert_eq!(state.num_empty_slots() as u32, size - i);
                 assert_eq!(state.alloc(None).unwrap().index(), i as usize);
             }
             assert!(state.alloc(None).is_none());
@@ -496,6 +512,9 @@ mod test {
         assert_ne!(index1, index2);
 
         state.free(index1);
+        assert_eq!(state.num_empty_slots(), 99);
+
+        // Allocate to the same `index1` slot again.
         let index3 = state.alloc(Some(id1)).unwrap();
         assert_eq!(index3, index1);
         state.free(index3);
@@ -512,13 +531,14 @@ mod test {
         // for id1, and 98 empty. Allocate 100 for id2. The first
         // should be equal to the one we know was previously used for
         // id2. The next 99 are arbitrary.
-
+        assert_eq!(state.num_empty_slots(), 100);
         let mut indices = vec![];
         for _ in 0..100 {
             indices.push(state.alloc(Some(id2)).unwrap());
         }
         assert!(state.alloc(None).is_none());
         assert_eq!(indices[0], index2);
+        assert_eq!(state.num_empty_slots(), 0);
 
         for i in indices {
             state.free(i);
diff --git a/crates/runtime/src/instance/allocator/pooling/memory_pool.rs b/crates/runtime/src/instance/allocator/pooling/memory_pool.rs
index 45038a21e860..8197386c3afa 100644
--- a/crates/runtime/src/instance/allocator/pooling/memory_pool.rs
+++ b/crates/runtime/src/instance/allocator/pooling/memory_pool.rs
@@ -1,25 +1,102 @@
+//! Implements a memory pool using a single allocated memory slab.
+//!
+//! The pooling instance allocator maps memory in advance and allocates
+//! instances, memories, tables, and stacks from a pool of available resources.
+//! Using the pooling instance allocator can speed up module instantiation when
+//! modules can be constrained based on configurable limits
+//! ([`InstanceLimits`]). Each new instance is stored in a "slot"; as instances
+//! are allocated and freed, these slots are either filled or emptied:
+//!
+//! ```text
+//! ┌──────┬──────┬──────┬──────┬──────┐
+//! │Slot 0│Slot 1│Slot 2│Slot 3│......│
+//! └──────┴──────┴──────┴──────┴──────┘
+//! ```
+//!
+//! Note that these slots are a useful abstraction but do not exactly match how
+//! the allocator lays the parts out in memory. Each new instance _does_ get
+//! associated with a slot number (see uses of `index` and [`SlotId`] in this
+//! module) but the parts of the instances are stored in separate pools:
+//! memories in the [`MemoryPool`], tables in the [`TablePool`], etc. What ties
+//! these various parts together is the slot number generated by an
+//! [`IndexAllocator`].
+//!
+//! 
The [`MemoryPool`] protects Wasmtime from out-of-bounds memory accesses by
+//! inserting inaccessible guard regions between memory slots. The
+//! [`MemoryPool`] documentation has a more in-depth chart but one can think of
+//! memories being laid out like the following:
+//!
+//! ```text
+//! ┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐
+//! │Guard│Mem 0│Guard│Mem 1│Guard│Mem 2│.....│Guard│
+//! └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
+//! ```
+//!
+//! To complicate matters, each instance can have multiple memories, multiple
+//! tables, etc. You might think these would be stored consecutively in their
+//! respective pools (for instance `n`, table 0 is at table pool slot `n + 0`
+//! and table 1 is at `n + 1`), but for memories this is not the case. With
+//! protection keys enabled, memories do not need interleaved guard regions
+//! because the protection key will signal a fault if the wrong memory is
+//! accessed. Instead, the pooling allocator "stripes" the memories with
+//! different protection keys.
+//!
+//! This concept, dubbed [ColorGuard] in the original paper, relies on careful
+//! calculation of the memory sizes to prevent any "overlapping access": there
+//! are limited protection keys available (15) so the next memory using the
+//! same key must be at least as far away as the guard region we would insert
+//! otherwise. This ends up looking like the following, where a store for
+//! instance 0 (`I0`) "stripes" two memories (`M1` and `M2`) with the same
+//! protection key 1, far enough apart to signal an OOB access:
+//!
+//! ```text
+//! ┌─────┬─────┬─────┬─────┬────────────────┬─────┬─────┬─────┐
+//! │.....│I0:M1│.....│.....│................│I0:M2│.....│.....│
+//! ├─────┼─────┼─────┼─────┼────────────────┼─────┼─────┼─────┤
+//! │.....│key 1│key 2│key 3│................│key 1│key 2│.....│
+//! └─────┴─────┴─────┴─────┴────────────────┴─────┴─────┴─────┘
+//! ```
+//!
+//! [ColorGuard]: https://plas2022.github.io/files/pdf/SegueColorGuard.pdf
+
 use super::{
     index_allocator::{MemoryInModule, ModuleAffinityIndexAllocator, SlotId},
     MemoryAllocationIndex,
 };
+use crate::mpk::{self, ProtectionKey, ProtectionMask};
 use crate::{
-    CompiledModuleId, InstanceAllocationRequest, Memory, MemoryImageSlot, Mmap,
+    AutoEnabled, CompiledModuleId, InstanceAllocationRequest, Memory, MemoryImageSlot, Mmap,
     PoolingInstanceAllocatorConfig,
 };
 use anyhow::{anyhow, bail, Context, Result};
 use libc::c_void;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Mutex;
 use wasmtime_environ::{
     DefinedMemoryIndex, MemoryPlan, MemoryStyle, Module, Tunables, WASM_PAGE_SIZE,
 };
 
+/// A set of allocator slots.
+///
+/// The allocated slots can be split by striping them: e.g., with two stripe
+/// colors 0 and 1, we would allocate all even slots using stripe 0 and all odd
+/// slots using stripe 1.
+///
+/// This is helpful for the use of protection keys: (a) if a request comes to
+/// allocate multiple instances, we can allocate them all from the same stripe
+/// and (b) if a store wants to allocate more from the same stripe it can.
+#[derive(Debug)]
+struct Stripe {
+    allocator: ModuleAffinityIndexAllocator,
+    pkey: Option<ProtectionKey>,
+}
+
 /// Represents a pool of WebAssembly linear memories.
 ///
 /// A linear memory is divided into accessible pages and guard pages.
 ///
 /// A diagram for this struct's fields is:
 ///
-/// ```ignore
+/// ```text
 ///                       memory_size
 ///                           /
 ///         max_accessible   /   memory_and_guard_size
 ///                 |       /            |
 ///          \      |       \           /
 ///        mapping  |  `max_total_memories` memories
 ///                /
-///   initial_memory_offset
+///   pre_slab_guard_size
 /// ```
 #[derive(Debug)]
 pub struct MemoryPool {
     mapping: Mmap,
-    index_allocator: ModuleAffinityIndexAllocator,
+    /// This memory pool is stripe-aware. If using memory protection keys, this
+    /// will contain one stripe per available key; otherwise, a single stripe
+    /// with no key.
+    stripes: Vec<Stripe>,
     // If using a copy-on-write allocation scheme, the slot management. We
     // dynamically transfer ownership of a slot to a Memory when in
     // use.
@@ -56,7 +136,7 @@ pub struct MemoryPool {
     // The size, in bytes, of the offset to the first linear memory in this
     // pool. This is here to help account for the first region of guard pages,
     // if desired, before the first linear memory.
-    initial_memory_offset: usize,
+    pre_slab_guard_size: usize,
     // The maximum number of memories that can be allocated concurrently, aka
     // our pool's capacity.
     max_total_memories: usize,
@@ -74,6 +154,9 @@ pub struct MemoryPool {
     //
     // Only applicable on Linux.
     keep_resident: usize,
+    // Keep track of protection keys handed out to initialized stores; this
+    // allows us to round-robin the assignment of stores to stripes.
+    next_available_pkey: AtomicUsize,
 }
 
 impl MemoryPool {
@@ -87,78 +170,128 @@ impl MemoryPool {
             );
         }
 
+        let pkeys = match config.memory_protection_keys {
+            AutoEnabled::Auto => {
+                if mpk::is_supported() {
+                    mpk::keys()
+                } else {
+                    &[]
+                }
+            }
+            AutoEnabled::Enable => {
+                if mpk::is_supported() {
+                    mpk::keys()
+                } else {
+                    bail!("MPK is not supported on this system")
+                }
+            }
+            AutoEnabled::Disable => &[],
+        };
+
+        // This is a tricky bit of global state: when creating a memory pool
+        // that uses memory protection keys, we ensure here that any host code
+        // will have access to all keys (i.e., stripes). It's only when we enter
+        // the WebAssembly guest code (see `StoreInner::call_hook`) that we
+        // enforce which keys/stripes can be accessed. Be forewarned about the
+        // assumptions here:
+        // - we expect this "allow all" configuration to reset the default
+        //   process state (only allow key 0) _before_ any memories are accessed
+        // - and we expect no other code (e.g., host-side code) to modify this
+        //   global MPK configuration
+        if !pkeys.is_empty() {
+            mpk::allow(ProtectionMask::all());
+        }
+
         // Interpret the larger of the maximal size of memory or the static
         // memory bound as the size of the virtual address space reservation for
        // memory itself. Typically `static_memory_bound` is 4G which helps
         // elide most bounds checks in wasm. If `memory_pages` is larger,
-        // though, then this is a non-moving pooling allocator so create larger
-        // reservations for account for that.
-        let memory_size = config.limits.memory_pages.max(tunables.static_memory_bound)
+        // though, then this is a non-moving pooling allocator so we create
+        // larger reservations to account for that.
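+        // For example (illustrative numbers, not a guarantee of defaults):
+        // with `memory_pages = 1` but a 4 GiB `static_memory_bound`, the bound
+        // wins and each slot still reserves 4 GiB of virtual address space.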
+        let max_memory_bytes = config.limits.memory_pages.max(tunables.static_memory_bound)
             * u64::from(WASM_PAGE_SIZE);
-        let memory_and_guard_size =
-            usize::try_from(memory_size + tunables.static_memory_offset_guard_size)
-                .map_err(|_| anyhow!("memory reservation size exceeds addressable memory"))?;
-
-        assert!(
-            memory_and_guard_size % crate::page_size() == 0,
-            "memory size {} is not a multiple of system page size",
-            memory_and_guard_size
-        );
-
-        let max_total_memories = config.limits.total_memories as usize;
-        let initial_memory_offset = if tunables.guard_before_linear_memory {
-            usize::try_from(tunables.static_memory_offset_guard_size).unwrap()
-        } else {
-            0
+        // Create a slab layout and allocate it as a completely inaccessible
+        // region to start--`PROT_NONE`.
+        let constraints = SlabConstraints {
+            max_memory_bytes: max_memory_bytes as usize,
+            num_memory_slots: config.limits.total_memories as usize,
+            num_pkeys_available: pkeys.len(),
+            guard_bytes: tunables.static_memory_offset_guard_size as usize,
+            guard_before_slots: tunables.guard_before_linear_memory,
         };
-
-        // The entire allocation here is the size of each memory (with guard
-        // regions) times the total number of memories in the pool.
-        //
-        // Note, though, that guard regions are required to be after each linear
-        // memory. If the `guard_before_linear_memory` setting is specified,
-        // then due to the contiguous layout of linear memories the guard pages
-        // after one memory are also guard pages preceding the next linear
-        // memory. This means that we only need to handle pre-guard-page sizes
-        // specially for the first linear memory, hence the
-        // `initial_memory_offset` variable here. If guards aren't specified
-        // before linear memories this is set to `0`, otherwise it's set to
-        // the same size as guard regions for other memories.
-        let allocation_size = memory_and_guard_size
-            .checked_mul(max_total_memories)
-            .and_then(|c| c.checked_add(initial_memory_offset))
-            .ok_or_else(|| {
-                anyhow!("total size of memory reservation exceeds addressable memory")
-            })?;
-
-        // Create a completely inaccessible region to start
-        let mapping = Mmap::accessible_reserved(0, allocation_size)
+        let layout = calculate(&constraints)?;
+        log::debug!("creating memory pool: {constraints:?} -> {layout:?}");
+        let mut mapping = Mmap::accessible_reserved(0, layout.total_slab_bytes)
            .context("failed to create memory pool mapping")?;
 
+        // Then, stripe the memory with the available protection keys. This is
+        // unnecessary if there is only one stripe color.
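+        // (E.g., with three keys the slots are colored 1,2,3,1,2,3,..., so
+        // two slots sharing a key are always at least two slot-widths apart.)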
+        if layout.num_stripes >= 2 {
+            let mut cursor = layout.pre_slab_guard_bytes;
+            let pkeys = &pkeys[..layout.num_stripes];
+            for i in 0..constraints.num_memory_slots {
+                let pkey = &pkeys[i % pkeys.len()];
+                let region = unsafe { mapping.slice_mut(cursor..cursor + layout.slot_bytes) };
+                pkey.protect(region)?;
+                cursor += layout.slot_bytes;
+            }
+            debug_assert_eq!(
+                cursor + layout.post_slab_guard_bytes,
+                layout.total_slab_bytes
+            );
+        }
+
         let image_slots: Vec<_> = std::iter::repeat_with(|| Mutex::new(None))
-            .take(max_total_memories)
+            .take(constraints.num_memory_slots)
             .collect();
 
-        let pool = Self {
-            index_allocator: ModuleAffinityIndexAllocator::new(
-                config.limits.total_memories,
+        let create_stripe = |i| {
+            let num_slots = constraints.num_memory_slots / layout.num_stripes
+                + usize::from(constraints.num_memory_slots % layout.num_stripes > i);
+            let allocator = ModuleAffinityIndexAllocator::new(
+                num_slots.try_into().unwrap(),
                 config.max_unused_warm_slots,
-            ),
+            );
+            Stripe {
+                allocator,
+                pkey: pkeys.get(i).cloned(),
+            }
+        };
+
+        debug_assert!(layout.num_stripes > 0);
+        let stripes: Vec<_> = (0..layout.num_stripes).map(create_stripe).collect();
+
+        let pool = Self {
+            stripes,
             mapping,
             image_slots,
-            memory_size: memory_size.try_into().unwrap(),
-            memory_and_guard_size,
-            initial_memory_offset,
-            max_total_memories,
+            memory_size: constraints.max_memory_bytes,
+            memory_and_guard_size: layout.slot_bytes,
+            pre_slab_guard_size: layout.pre_slab_guard_bytes,
+            max_total_memories: constraints.num_memory_slots,
             memories_per_instance: usize::try_from(config.limits.max_memories_per_module).unwrap(),
             max_accessible: (config.limits.memory_pages as usize) * (WASM_PAGE_SIZE as usize),
             keep_resident: config.linear_memory_keep_resident,
+            next_available_pkey: AtomicUsize::new(0),
         };
 
         Ok(pool)
     }
 
+    /// Return a protection key that stores can use for requesting new
+    /// memories.
+    pub fn next_available_pkey(&self) -> Option<ProtectionKey> {
+        let index = self.next_available_pkey.fetch_add(1, Ordering::SeqCst) % self.stripes.len();
+        debug_assert!(
+            self.stripes.len() < 2 || self.stripes[index].pkey.is_some(),
+            "if we are using stripes, we cannot have an empty protection key"
+        );
+        self.stripes[index].pkey.clone()
+    }
+
     /// Validate whether this memory pool supports the given module.
     pub fn validate(&self, module: &Module) -> Result<()> {
         let memories = module.memory_plans.len() - module.num_imported_memories;
@@ -201,7 +334,12 @@ impl MemoryPool {
 
     /// Are zero slots in use right now?
     pub fn is_empty(&self) -> bool {
-        self.index_allocator.is_empty()
+        for stripe in &self.stripes {
+            if !stripe.allocator.is_empty() {
+                return false;
+            }
+        }
+        true
     }
 
     /// Allocate a single memory for the given instance allocation request.
@@ -211,21 +349,31 @@ impl MemoryPool { memory_plan: &MemoryPlan, memory_index: DefinedMemoryIndex, ) -> Result<(MemoryAllocationIndex, Memory)> { - let allocation_index = self - .index_allocator + let stripe_index = if let Some(pkey) = &request.pkey { + pkey.as_stripe() + } else { + debug_assert!(self.stripes.len() < 2); + 0 + }; + + let striped_allocation_index = self.stripes[stripe_index] + .allocator .alloc( request .runtime_info .unique_id() .map(|id| MemoryInModule(id, memory_index)), ) - .map(|slot| MemoryAllocationIndex(u32::try_from(slot.index()).unwrap())) + .map(|slot| StripedAllocationIndex(u32::try_from(slot.index()).unwrap())) .ok_or_else(|| { anyhow!( - "maximum concurrent memory limit of {} reached", - self.max_total_memories + "maximum concurrent memory limit of {} reached for stripe {}", + self.stripes[stripe_index].allocator.len(), + stripe_index ) })?; + let allocation_index = + striped_allocation_index.as_unstriped_slot_index(stripe_index, self.stripes.len()); match (|| { // Double-check that the runtime requirements of the memory are @@ -273,7 +421,9 @@ impl MemoryPool { })() { Ok(memory) => Ok((allocation_index, memory)), Err(e) => { - self.index_allocator.free(SlotId(allocation_index.0)); + self.stripes[stripe_index] + .allocator + .free(SlotId(striped_allocation_index.0)); Err(e) } } @@ -297,7 +447,11 @@ impl MemoryPool { self.return_memory_image_slot(allocation_index, image); } - self.index_allocator.free(SlotId(allocation_index.0)); + let stripe_index = allocation_index.index() % self.stripes.len(); + let stripe_slot = allocation_index.0 / self.stripes.len() as u32; + self.stripes[stripe_index] + .allocator + .free(SlotId(stripe_slot)); } /// Purging everything related to `module`. @@ -323,23 +477,25 @@ impl MemoryPool { // associated with a module (not just module and memory). The latter // would require care to make sure that its maintenance wouldn't be too // expensive for normal allocation/free operations. - for i in 0..self.memories_per_instance { - use wasmtime_environ::EntityRef; - let memory_index = DefinedMemoryIndex::new(i); - while let Some(id) = self - .index_allocator - .alloc_affine_and_clear_affinity(module, memory_index) - { - // Clear the image from the slot and, if successful, return it back - // to our state. Note that on failure here the whole slot will get - // paved over with an anonymous mapping. - let index = MemoryAllocationIndex(id.0); - let mut slot = self.take_memory_image_slot(index); - if slot.remove_image().is_ok() { - self.return_memory_image_slot(index, slot); - } + for stripe in &self.stripes { + for i in 0..self.memories_per_instance { + use wasmtime_environ::EntityRef; + let memory_index = DefinedMemoryIndex::new(i); + while let Some(id) = stripe + .allocator + .alloc_affine_and_clear_affinity(module, memory_index) + { + // Clear the image from the slot and, if successful, return it back + // to our state. Note that on failure here the whole slot will get + // paved over with an anonymous mapping. 
+                    let index = MemoryAllocationIndex(id.0);
+                    let mut slot = self.take_memory_image_slot(index);
+                    if slot.remove_image().is_ok() {
+                        self.return_memory_image_slot(index, slot);
+                    }
 
-                self.index_allocator.free(id);
+                    stripe.allocator.free(id);
+                }
             }
         }
     }
 
     fn get_base(&self, allocation_index: MemoryAllocationIndex) -> *mut u8 {
         assert!(allocation_index.index() < self.max_total_memories);
         let offset =
-            self.initial_memory_offset + allocation_index.index() * self.memory_and_guard_size;
+            self.pre_slab_guard_size + allocation_index.index() * self.memory_and_guard_size;
         unsafe { self.mapping.as_ptr().offset(offset as isize).cast_mut() }
     }
@@ -393,10 +549,146 @@ impl Drop for MemoryPool {
     }
 }
 
+/// The index of a memory allocation within a single stripe of the pool.
+#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
+pub struct StripedAllocationIndex(u32);
+
+impl StripedAllocationIndex {
+    fn as_unstriped_slot_index(self, stripe: usize, num_stripes: usize) -> MemoryAllocationIndex {
+        let num_stripes: u32 = num_stripes.try_into().unwrap();
+        let stripe: u32 = stripe.try_into().unwrap();
+        MemoryAllocationIndex(self.0 * num_stripes + stripe)
+    }
+}
+
+#[derive(Clone, Debug)]
+struct SlabConstraints {
+    max_memory_bytes: usize,
+    num_memory_slots: usize,
+    num_pkeys_available: usize,
+    guard_bytes: usize,
+    guard_before_slots: bool,
+}
+
+#[derive(Debug)]
+struct SlabLayout {
+    /// The total number of bytes to allocate for the memory pool slab.
+    total_slab_bytes: usize,
+    /// If necessary, the number of bytes to reserve as a guard region at the
+    /// beginning of the slab.
+    pre_slab_guard_bytes: usize,
+    /// If necessary, the number of bytes to reserve as a guard region at the
+    /// end of the slab.
+    post_slab_guard_bytes: usize,
+    /// The size of each slot in the memory pool; this comprehends the maximum
+    /// memory size (i.e., from WebAssembly or Wasmtime configuration) plus any
+    /// guard region after the memory to catch OOB access. On these guard
+    /// regions, note that:
+    /// - users can configure how aggressively (or not) to elide bounds checks
+    ///   via `Config::static_memory_guard_size`
+    /// - memory protection keys can compress the size of the guard region by
+    ///   placing slots from a different key (i.e., a stripe) in the guard
+    ///   region
+    slot_bytes: usize,
+    /// The number of stripes needed in the slab layout.
+    num_stripes: usize,
+}
+
+fn calculate(constraints: &SlabConstraints) -> Result<SlabLayout> {
+    let SlabConstraints {
+        max_memory_bytes,
+        num_memory_slots,
+        num_pkeys_available,
+        guard_bytes,
+        guard_before_slots,
+    } = *constraints;
+
+    // If the user specifies a guard region, we always need to allocate a
+    // `PROT_NONE` region for it before any memory slots. Recall that we can
+    // avoid bounds checks for loads and stores with immediates up to
+    // `guard_bytes`, but we rely on Wasmtime to emit bounds checks for any
+    // accesses greater than this.
+    let pre_slab_guard_bytes = if guard_before_slots { guard_bytes } else { 0 };
+
+    let (num_stripes, needed_guard_bytes) = if guard_bytes == 0 || max_memory_bytes == 0 {
+        // In the uncommon case where the memory or guard regions are empty, we
+        // will not need any stripes: we just lay out the slots back-to-back
+        // using a single stripe.
+        (1, guard_bytes)
+    } else if num_pkeys_available < 2 {
+        // If we do not have enough protection keys to stripe the memory, we do
+        // the same. 
We can't elide any of the guard bytes because we aren't + // overlapping guard regions with other stripes... + (1, guard_bytes) + } else if num_memory_slots == 0 { + (1, guard_bytes) + } else { + // ...but if we can create at least two stripes, we can use another + // stripe (i.e., a different pkey) as this slot's guard region--this + // reduces the guard bytes each slot has to allocate. We must make sure, + // though, that if the size of that other stripe(s) does not fully cover + // `guard_bytes`, we keep those around to prevent OOB access. + // + // We first calculate the number of stripes we need: we want to minimize + // this so that there is less chance of a single store running out of + // slots with its stripe--we need at least two, though. But this is not + // just an optimization; we need to handle the case when there are fewer + // slots than stripes. E.g., if our pool is configured with only three + // slots (`num_memory_slots = 3`), we will run into failures if we + // attempt to set up more than three stripes. + let needed_num_stripes = + guard_bytes / max_memory_bytes + usize::from(guard_bytes % max_memory_bytes != 0) + 1; + let num_stripes = num_pkeys_available + .min(needed_num_stripes) + .min(num_memory_slots); + let next_slots_overlapping_bytes = max_memory_bytes + .checked_mul(num_stripes - 1) + .unwrap_or(usize::MAX); + let needed_guard_bytes = guard_bytes + .checked_sub(next_slots_overlapping_bytes) + .unwrap_or(0); + (num_stripes, needed_guard_bytes) + }; + + // The page-aligned slot size; equivalent to `memory_and_guard_size`. + let page_alignment = crate::page_size() - 1; + let slot_bytes = max_memory_bytes + .checked_add(needed_guard_bytes) + .and_then(|slot_bytes| slot_bytes.checked_add(page_alignment)) + .and_then(|slot_bytes| Some(slot_bytes & !page_alignment)) + .ok_or_else(|| anyhow!("slot size is too large"))?; + + // We may need another guard region (like `pre_slab_guard_bytes`) at the end + // of our slab. We could be conservative and just create it as large as + // `guard_bytes`, but because we know that the last slot already has a + // region as large as `needed_guard_bytes`, we can reduce the final guard + // region by that much. + let post_slab_guard_bytes = guard_bytes - needed_guard_bytes; + + // The final layout (where `n = num_memory_slots`): + // ┌────────────────────┬──────┬──────┬───┬──────┬─────────────────────┐ + // │pre_slab_guard_bytes│slot 1│slot 2│...│slot n│post_slab_guard_bytes│ + // └────────────────────┴──────┴──────┴───┴──────┴─────────────────────┘ + let total_slab_bytes = slot_bytes + .checked_mul(num_memory_slots) + .and_then(|c| c.checked_add(pre_slab_guard_bytes)) + .and_then(|c| c.checked_add(post_slab_guard_bytes)) + .ok_or_else(|| anyhow!("total size of memory reservation exceeds addressable memory"))?; + + Ok(SlabLayout { + total_slab_bytes, + pre_slab_guard_bytes, + post_slab_guard_bytes, + slot_bytes, + num_stripes, + }) +} + #[cfg(test)] mod tests { use super::*; use crate::{InstanceLimits, PoolingInstanceAllocator}; + use proptest::prelude::*; use wasmtime_environ::WASM_PAGE_SIZE; #[cfg(target_pointer_width = "64")] @@ -457,4 +749,141 @@ mod tests { .unwrap(); assert_eq!(pool.memories.memory_size, 2 * 65536); } + + #[test] + fn test_pooling_allocator_striping() { + if !mpk::is_supported() { + println!("skipping `test_pooling_allocator_striping` test; mpk is not supported"); + return; + } + + // Force the use of MPK. 
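+        // Using `Enable` (rather than `Auto`) would make `MemoryPool::new`
+        // fail if MPK were unavailable; the early return above guards that.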
+ let config = PoolingInstanceAllocatorConfig { + memory_protection_keys: AutoEnabled::Enable, + ..PoolingInstanceAllocatorConfig::default() + }; + let pool = MemoryPool::new(&config, &Tunables::default()).unwrap(); + assert!(pool.stripes.len() >= 2); + + let max_memory_slots = config.limits.total_memories; + dbg!(pool.stripes[0].allocator.num_empty_slots()); + dbg!(pool.stripes[1].allocator.num_empty_slots()); + let available_memory_slots: usize = pool + .stripes + .iter() + .map(|s| s.allocator.num_empty_slots()) + .sum(); + assert_eq!(max_memory_slots, available_memory_slots.try_into().unwrap()); + } + + #[test] + fn check_known_layout_calculations() { + for num_pkeys_available in 0..16 { + for num_memory_slots in [0, 1, 10, 64] { + for max_memory_bytes in + [0, 1 * WASM_PAGE_SIZE as usize, 10 * WASM_PAGE_SIZE as usize] + { + for guard_bytes in [0, 2 << 30 /* 2GB */] { + for guard_before_slots in [true, false] { + let constraints = SlabConstraints { + max_memory_bytes, + num_memory_slots, + num_pkeys_available, + guard_bytes, + guard_before_slots, + }; + let layout = calculate(&constraints); + assert_slab_layout_invariants(constraints, layout.unwrap()); + } + } + } + } + } + } + + proptest! { + #[test] + fn check_random_layout_calculations(c in constraints()) { + if let Ok(l) = calculate(&c) { + assert_slab_layout_invariants(c, l); + } + } + } + + fn constraints() -> impl Strategy { + ( + any::(), + any::(), + any::(), + any::(), + any::(), + ) + .prop_map( + |( + max_memory_bytes, + num_memory_slots, + num_pkeys_available, + guard_bytes, + guard_before_slots, + )| { + SlabConstraints { + max_memory_bytes, + num_memory_slots, + num_pkeys_available, + guard_bytes, + guard_before_slots, + } + }, + ) + } + + fn assert_slab_layout_invariants(c: SlabConstraints, s: SlabLayout) { + // Check that all the sizes add up. + assert_eq!( + s.total_slab_bytes, + s.pre_slab_guard_bytes + s.slot_bytes * c.num_memory_slots + s.post_slab_guard_bytes, + "the slab size does not add up: {c:?} => {s:?}" + ); + + // Check that the memory slot size is page-aligned. + assert!( + s.slot_bytes % crate::page_size() == 0, + "slot size is not page-aligned: {c:?} => {s:?}", + ); + + // Check that we use no more or less stripes than needed. + assert!(s.num_stripes >= 1, "not enough stripes: {c:?} => {s:?}"); + if c.num_pkeys_available == 0 || c.num_memory_slots == 0 { + assert_eq!( + s.num_stripes, 1, + "expected at least one stripe: {c:?} => {s:?}" + ); + } else { + assert!( + s.num_stripes <= c.num_pkeys_available, + "layout has more stripes than available pkeys: {c:?} => {s:?}" + ); + assert!( + s.num_stripes <= c.num_memory_slots, + "layout has more stripes than memory slots: {c:?} => {s:?}" + ); + } + + // Check that we use the minimum number of stripes/protection keys. + // - if the next slot is bigger + if c.num_pkeys_available > 1 && c.max_memory_bytes > 0 { + assert!( + s.num_stripes <= (c.guard_bytes / c.max_memory_bytes) + 2, + "calculated more stripes than needed: {c:?} => {s:?}" + ); + } + + // Check that the memory-striping will not allow OOB access. + if s.num_stripes > 1 { + assert!( + s.slot_bytes * (s.num_stripes - 1) >= c.guard_bytes, + "layout may allow OOB access: {c:?} => {s:?}" + ); + } + } } diff --git a/crates/wasmtime/src/config.rs b/crates/wasmtime/src/config.rs index 2f657e462cd6..7dc15c020188 100644 --- a/crates/wasmtime/src/config.rs +++ b/crates/wasmtime/src/config.rs @@ -1183,6 +1183,10 @@ impl Config { /// always be static memories, they are never dynamic. 
This setting /// configures the size of linear memory to reserve for each memory in the /// pooling allocator. + /// + /// Note that the pooling allocator can reduce the amount of memory needed + /// for pooling allocation by using memory protection; see + /// [`Config::memory_protection_keys`] for details. pub fn static_memory_maximum_size(&mut self, max_size: u64) -> &mut Self { let max_pages = max_size / u64::from(wasmtime_environ::WASM_PAGE_SIZE); self.tunables.static_memory_bound = max_pages; @@ -1195,7 +1199,7 @@ impl Config { /// linear memories created within this `Config`. This means that all /// memories will be allocated up-front and will never move. Additionally /// this means that all memories are synthetically limited by the - /// [`Config::static_memory_maximum_size`] option, irregardless of what the + /// [`Config::static_memory_maximum_size`] option, regardless of what the /// actual maximum size is on the memory's original type. /// /// For the difference between static and dynamic memories, see the @@ -1237,7 +1241,7 @@ impl Config { /// immediate offsets will generate bounds checks based on how big the guard /// page is. /// - /// For 32-bit memories a 4GB static memory is required to even start + /// For 32-bit wasm memories a 4GB static memory is required to even start /// removing bounds checks. A 4GB guard size will guarantee that the module /// has zero bounds checks for memory accesses. A 2GB guard size will /// eliminate all bounds checks with an immediate offset less than 2GB. A @@ -2303,6 +2307,34 @@ impl PoolingAllocationConfig { self.config.limits.memory_pages = pages; self } + + /// Configures whether memory protection keys (MPK) should be used for more + /// efficient layout of pool-allocated memories. + /// + /// When using the pooling allocator (see [`Config::allocation_strategy`], + /// [`InstanceAllocationStrategy::Pooling`]), memory protection keys can + /// reduce the total amount of allocated memory by eliminating guard regions + /// between WebAssembly memories in the pool. It does so by "coloring" + /// memory regions with different memory keys and setting which regions are + /// accessible each time executions switches from host to guest (or vice + /// versa). + /// + /// MPK is only available on Linux (called `pku` there) and recent x86 + /// systems. Checking for support at runtime is possible with + /// [`mpk::is_supported`][wasmtime_runtime::mpk::is_supported]. This + /// configuration setting can be in three states: + /// + /// - `auto`: if MPK support is available the guard regions are removed; if + /// not, the guard regions remain + /// - `enable`: use MPK to eliminate guard regions; fail if MPK is not + /// supported + /// - `disable`: never use MPK + /// + /// By default this value is `auto`. 
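+    ///
+    /// A minimal sketch of opting in, with the rest of the engine setup
+    /// assumed (`Auto` falls back to guard regions when MPK is unavailable):
+    ///
+    /// ```ignore
+    /// let mut pooling = PoolingAllocationConfig::default();
+    /// pooling.memory_protection_keys(wasmtime_runtime::AutoEnabled::Auto);
+    /// let mut config = Config::new();
+    /// config.allocation_strategy(InstanceAllocationStrategy::Pooling(pooling));
+    /// let engine = Engine::new(&config)?;
+    /// ```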
+ pub fn memory_protection_keys(&mut self, enable: wasmtime_runtime::AutoEnabled) -> &mut Self { + self.config.memory_protection_keys = enable; + self + } } pub(crate) fn probestack_supported(arch: Architecture) -> bool { diff --git a/crates/wasmtime/src/instance.rs b/crates/wasmtime/src/instance.rs index 0adb96c8d2ed..af070727af1b 100644 --- a/crates/wasmtime/src/instance.rs +++ b/crates/wasmtime/src/instance.rs @@ -284,6 +284,7 @@ impl Instance { host_state: Box::new(Instance(instance_to_be)), store: StorePtr::new(store.traitobj()), wmemcheck: store.engine().config().wmemcheck, + pkey: store.get_pkey(), })?; // The instance still has lots of setup, for example diff --git a/crates/wasmtime/src/store.rs b/crates/wasmtime/src/store.rs index d6f2bc574475..48436c04f639 100644 --- a/crates/wasmtime/src/store.rs +++ b/crates/wasmtime/src/store.rs @@ -96,9 +96,9 @@ use std::sync::atomic::AtomicU64; use std::sync::Arc; use std::task::{Context, Poll}; use wasmtime_runtime::{ - ExportGlobal, InstanceAllocationRequest, InstanceAllocator, InstanceHandle, ModuleInfo, - OnDemandInstanceAllocator, SignalHandler, StoreBox, StorePtr, VMContext, VMExternRef, - VMExternRefActivationsTable, VMFuncRef, VMRuntimeLimits, WasmFault, + mpk::ProtectionKey, ExportGlobal, InstanceAllocationRequest, InstanceAllocator, InstanceHandle, + ModuleInfo, OnDemandInstanceAllocator, SignalHandler, StoreBox, StorePtr, VMContext, + VMExternRef, VMExternRefActivationsTable, VMFuncRef, VMRuntimeLimits, WasmFault, }; mod context; @@ -343,6 +343,11 @@ pub struct StoreOpaque { /// `store_data` above, where the function pointers are stored. rooted_host_funcs: ManuallyDrop>>, + /// Keep track of what protection key is being used during allocation so + /// that the right memory pages can be enabled when entering WebAssembly + /// guest code. + pkey: Option, + /// Runtime state for components used in the handling of resources, borrow, /// and calls. These also interact with the `ResourceAny` type and its /// internal representation. @@ -473,6 +478,7 @@ impl Store { /// tables created to 10,000. This can be overridden with the /// [`Store::limiter`] configuration method. pub fn new(engine: &Engine, data: T) -> Self { + let pkey = engine.allocator().next_available_pkey(); let mut inner = Box::new(StoreInner { inner: StoreOpaque { _marker: marker::PhantomPinned, @@ -504,6 +510,7 @@ impl Store { hostcall_val_storage: Vec::new(), wasm_val_raw_storage: Vec::new(), rooted_host_funcs: ManuallyDrop::new(Vec::new()), + pkey, #[cfg(feature = "component-model")] component_host_table: Default::default(), #[cfg(feature = "component-model")] @@ -537,6 +544,7 @@ impl Store { store: StorePtr::empty(), runtime_info: &shim, wmemcheck: engine.config().wmemcheck, + pkey: None, }) .expect("failed to allocate default callee") }; @@ -1164,6 +1172,16 @@ impl StoreInner { } pub fn call_hook(&mut self, s: CallHook) -> Result<()> { + if let Some(pkey) = &self.inner.pkey { + let allocator = self.engine().allocator(); + match s { + CallHook::CallingWasm | CallHook::ReturningFromHost => { + allocator.restrict_to_pkey(*pkey) + } + CallHook::ReturningFromWasm | CallHook::CallingHost => allocator.allow_all_pkeys(), + } + } + match &mut self.call_hook { Some(CallHookInner::Sync(hook)) => hook(&mut self.data, s), @@ -1669,6 +1687,12 @@ at https://bytecodealliance.org/security. std::process::abort(); } + /// Retrieve the store's protection key. 
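+    ///
+    /// The key is chosen once at `Store::new` (via the engine allocator's
+    /// `next_available_pkey`) and re-activated by `call_hook` every time
+    /// execution crosses the host-guest boundary.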
+ #[inline] + pub(crate) fn get_pkey(&self) -> Option { + self.pkey.clone() + } + #[inline] #[cfg(feature = "component-model")] pub(crate) fn component_calls_and_host_table( diff --git a/crates/wasmtime/src/trampoline.rs b/crates/wasmtime/src/trampoline.rs index 3822f6027777..7f2499fb3ef2 100644 --- a/crates/wasmtime/src/trampoline.rs +++ b/crates/wasmtime/src/trampoline.rs @@ -48,6 +48,7 @@ fn create_handle( store: StorePtr::new(store.traitobj()), runtime_info, wmemcheck: false, + pkey: None, })?; Ok(store.add_dummy_instance(handle)) diff --git a/crates/wasmtime/src/trampoline/memory.rs b/crates/wasmtime/src/trampoline/memory.rs index bc1423991783..72c7eb891f81 100644 --- a/crates/wasmtime/src/trampoline/memory.rs +++ b/crates/wasmtime/src/trampoline/memory.rs @@ -10,6 +10,7 @@ use wasmtime_environ::{ DefinedMemoryIndex, DefinedTableIndex, EntityIndex, HostPtr, MemoryPlan, MemoryStyle, Module, VMOffsets, WASM_PAGE_SIZE, }; +use wasmtime_runtime::mpk::ProtectionKey; use wasmtime_runtime::{ CompiledModuleId, Imports, InstanceAllocationRequest, InstanceAllocator, InstanceAllocatorImpl, Memory, MemoryAllocationIndex, MemoryImage, OnDemandInstanceAllocator, RuntimeLinearMemory, @@ -63,6 +64,7 @@ pub fn create_memory( store: StorePtr::new(store.traitobj()), runtime_info, wmemcheck: false, + pkey: None, }; unsafe { @@ -252,4 +254,16 @@ unsafe impl InstanceAllocatorImpl for SingleMemoryInstance<'_> { fn purge_module(&self, _: CompiledModuleId) { unreachable!() } + + fn next_available_pkey(&self) -> Option { + unreachable!() + } + + fn restrict_to_pkey(&self, _: ProtectionKey) { + unreachable!() + } + + fn allow_all_pkeys(&self) { + unreachable!() + } } diff --git a/tests/all/pooling_allocator.rs b/tests/all/pooling_allocator.rs index f6fcc2a541c0..dc23686de885 100644 --- a/tests/all/pooling_allocator.rs +++ b/tests/all/pooling_allocator.rs @@ -1169,7 +1169,7 @@ fn total_memories_limit() -> Result<()> { Err(e) => assert_eq!( e.to_string(), format!( - "maximum concurrent memory limit of {} reached", + "maximum concurrent memory limit of {} reached for stripe 0", TOTAL_MEMORIES ), ), From f5179be7123e43d9c8f4492290987776f3474df8 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Wed, 20 Sep 2023 16:58:58 -0700 Subject: [PATCH 3/3] Address review comments This is a rollup of 43 commits addressing review comments of various kinds: bug fixes, refactorings, documentation improvements, etc. It also ensures that CI runs all checks. A big thanks to @fitzgen and @alexcrichton for the review! 
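One embedder-visible change worth calling out: the tri-state `AutoEnabled`
enum now lives in the `mpk` module as `MpkEnabled` and is re-exported from
both `wasmtime-runtime` and `wasmtime`. Opting in now looks roughly like this
(sketch only; the remaining pooling configuration is elided):

    let mut pooling = PoolingAllocationConfig::default();
    pooling.memory_protection_keys(MpkEnabled::Auto);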
prtest:full Co-authored-by: Nick Fitzgerald Co-authored-by: Alex Crichton --- Cargo.lock | 23 +- crates/runtime/Cargo.toml | 2 +- crates/runtime/src/instance/allocator.rs | 4 +- .../src/instance/allocator/on_demand.rs | 10 +- .../runtime/src/instance/allocator/pooling.rs | 18 +- .../instance/allocator/pooling/memory_pool.rs | 459 +++++++++++------- crates/runtime/src/lib.rs | 2 + crates/runtime/src/mpk/disabled.rs | 6 +- crates/runtime/src/mpk/enabled.rs | 90 ++-- crates/runtime/src/mpk/mod.rs | 49 +- crates/runtime/src/mpk/pkru.rs | 20 +- crates/runtime/src/mpk/sys.rs | 34 +- crates/wasmtime/src/config.rs | 39 +- 13 files changed, 448 insertions(+), 308 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4fad9916057b..3597329678f8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1470,7 +1470,7 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" dependencies = [ - "quick-error", + "quick-error 1.2.3", ] [[package]] @@ -1843,7 +1843,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -2037,22 +2036,22 @@ dependencies = [ [[package]] name = "proptest" -version = "1.2.0" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e35c06b98bf36aba164cc17cb25f7e232f5c4aeea73baa14b8a9f0d92dbfa65" +checksum = "1e0d9cc07f18492d879586c92b485def06bc850da3118075cd45d50e9c95b0e5" dependencies = [ "bit-set", "bitflags 1.3.2", "byteorder", "lazy_static", "num-traits", + "quick-error 2.0.1", "rand", "rand_chacha", "rand_xorshift", "regex-syntax 0.6.25", "rusty-fork", "tempfile", - "unarray", ] [[package]] @@ -2081,6 +2080,12 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quote" version = "1.0.29" @@ -2344,7 +2349,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" dependencies = [ "fnv", - "quick-error", + "quick-error 1.2.3", "tempfile", "wait-timeout", ] @@ -2854,12 +2859,6 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" -[[package]] -name = "unarray" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" - [[package]] name = "unicase" version = "2.6.0" diff --git a/crates/runtime/Cargo.toml b/crates/runtime/Cargo.toml index 63fb6bc23bef..0ff9f69ae8c1 100644 --- a/crates/runtime/Cargo.toml +++ b/crates/runtime/Cargo.toml @@ -50,7 +50,7 @@ features = [ [dev-dependencies] once_cell = { workspace = true } -proptest = "1.2.0" +proptest = "1.0.0" [build-dependencies] cc = "1.0" diff --git a/crates/runtime/src/instance/allocator.rs b/crates/runtime/src/instance/allocator.rs index a8cb2b5d0107..5205d87c3444 100644 --- a/crates/runtime/src/instance/allocator.rs +++ b/crates/runtime/src/instance/allocator.rs @@ -24,9 +24,7 @@ pub use 
self::on_demand::OnDemandInstanceAllocator; #[cfg(feature = "pooling-allocator")] mod pooling; #[cfg(feature = "pooling-allocator")] -pub use self::pooling::{ - AutoEnabled, InstanceLimits, PoolingInstanceAllocator, PoolingInstanceAllocatorConfig, -}; +pub use self::pooling::{InstanceLimits, PoolingInstanceAllocator, PoolingInstanceAllocatorConfig}; /// Represents a request for a new runtime instance. pub struct InstanceAllocationRequest<'a> { diff --git a/crates/runtime/src/instance/allocator/on_demand.rs b/crates/runtime/src/instance/allocator/on_demand.rs index 3874affb7ccc..935074729fd3 100644 --- a/crates/runtime/src/instance/allocator/on_demand.rs +++ b/crates/runtime/src/instance/allocator/on_demand.rs @@ -161,10 +161,16 @@ unsafe impl InstanceAllocatorImpl for OnDemandInstanceAllocator { } fn restrict_to_pkey(&self, _: ProtectionKey) { - // The on-demand allocator cannot use protection keys. + // The on-demand allocator cannot use protection keys; an on-demand + // allocator will never hand out protection keys to the stores its + // engine creates. + unreachable!() } fn allow_all_pkeys(&self) { - // The on-demand allocator cannot use protection keys. + // The on-demand allocator cannot use protection keys; an on-demand + // allocator will never hand out protection keys to the stores its + // engine creates. + unreachable!() } } diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs index 202fc1850408..7d00a59052bb 100644 --- a/crates/runtime/src/instance/allocator/pooling.rs +++ b/crates/runtime/src/instance/allocator/pooling.rs @@ -80,7 +80,7 @@ use super::{ }; use crate::{ instance::Instance, - mpk::{self, ProtectionKey, ProtectionMask}, + mpk::{self, MpkEnabled, ProtectionKey, ProtectionMask}, CompiledModuleId, Memory, Table, }; use anyhow::{bail, Result}; @@ -218,7 +218,7 @@ pub struct PoolingInstanceAllocatorConfig { /// Same as `linear_memory_keep_resident` but for tables. pub table_keep_resident: usize, /// Whether to enable memory protection keys. - pub memory_protection_keys: AutoEnabled, + pub memory_protection_keys: MpkEnabled, } impl Default for PoolingInstanceAllocatorConfig { @@ -231,23 +231,11 @@ impl Default for PoolingInstanceAllocatorConfig { async_stack_keep_resident: 0, linear_memory_keep_resident: 0, table_keep_resident: 0, - memory_protection_keys: AutoEnabled::Disable, + memory_protection_keys: MpkEnabled::Disable, } } } -/// Describe the tri-state configuration of memory protection keys (MPK). -#[derive(Clone, Copy, Debug)] -pub enum AutoEnabled { - /// Use MPK if supported by the current system; fall back to guard regions - /// otherwise. - Auto, - /// Use MPK or fail if not supported. - Enable, - /// Do not use MPK. - Disable, -} - /// Implements the pooling instance allocator. 
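+/// When memory protection keys are enabled, the pool's linear memories are
+/// additionally striped across the available keys (see `MemoryPool`).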
///
/// This allocator internally maintains pools of instances, memories, tables,
diff --git a/crates/runtime/src/instance/allocator/pooling/memory_pool.rs b/crates/runtime/src/instance/allocator/pooling/memory_pool.rs
index 8197386c3afa..47455a0e3558 100644
--- a/crates/runtime/src/instance/allocator/pooling/memory_pool.rs
+++ b/crates/runtime/src/instance/allocator/pooling/memory_pool.rs
@@ -64,8 +64,8 @@ use super::{
 };
 use crate::mpk::{self, ProtectionKey, ProtectionMask};
 use crate::{
-    AutoEnabled, CompiledModuleId, InstanceAllocationRequest, Memory, MemoryImageSlot, Mmap,
-    PoolingInstanceAllocatorConfig,
+    CompiledModuleId, InstanceAllocationRequest, InstanceLimits, Memory, MemoryImageSlot, Mmap,
+    MpkEnabled, PoolingInstanceAllocatorConfig,
 };
 use anyhow::{anyhow, bail, Context, Result};
 use libc::c_void;
@@ -97,21 +97,17 @@ struct Stripe {
 /// A diagram for this struct's fields is:
 ///
 /// ```text
-///                     memory_size
-///                         /
-///        max_accessible  /  memory_and_guard_size
-///               |       /          |
-///            <--+--->  / <---------+--------->
-///            <--------+->
-///
-/// +-----------+--------+---+-----------+          +--------+---+-----------+
-/// | PROT_NONE |        |   | PROT_NONE | ...      |        |   | PROT_NONE |
-/// +-----------+--------+---+-----------+          +--------+---+-----------+
-/// |           |<------------------+---------------------------------->
-/// \           |                    \
-/// mapping     |           `max_total_memories` memories
+///     layout.max_memory_bytes                 layout.slot_bytes
+///                |                                   |
+///          <-----+---->                  <-----------+----------->
+/// +-----------+------------+-----------+     +--------+---+-----------+-----------+
+/// | PROT_NONE |            | PROT_NONE | ... |        |   | PROT_NONE | PROT_NONE |
+/// +-----------+------------+-----------+     +--------+---+-----------+-----------+
+/// |           |<------------------+----------------------------------> <----+---->
+/// \           |                    \                                        |
+/// mapping     |          `layout.num_slots` memories       layout.post_slab_guard_bytes
 ///             /
-///  pre_slab_guard_size
+///  layout.pre_slab_guard_bytes
 /// ```
 #[derive(Debug)]
 pub struct MemoryPool {
@@ -124,22 +120,9 @@ pub struct MemoryPool {
     // dynamically transfer ownership of a slot to a Memory when in
     // use.
     image_slots: Vec<Mutex<Option<MemoryImageSlot>>>,
-    // The size, in bytes, of each linear memory's reservation, not including
-    // any guard region.
-    memory_size: usize,
-    // The size, in bytes, of each linear memory's reservation plus the trailing
-    // guard region allocated for it.
-    memory_and_guard_size: usize,
-    // The maximum size that can become accessible, in bytes, of each linear
-    // memory. Guaranteed to be a whole number of wasm pages.
-    max_accessible: usize,
-    // The size, in bytes, of the offset to the first linear memory in this
-    // pool. This is here to help account for the first region of guard pages,
-    // if desired, before the first linear memory.
-    pre_slab_guard_size: usize,
-    // The maximum number of memories that can be allocated concurrently, aka
-    // our pool's capacity.
-    max_total_memories: usize,
+    /// A description of the various memory sizes used in allocating the
+    /// `mapping` slab.
+    layout: SlabLayout,
     // The maximum number of memories that a single core module instance may
    // use.
// @@ -171,21 +154,21 @@ impl MemoryPool { } let pkeys = match config.memory_protection_keys { - AutoEnabled::Auto => { + MpkEnabled::Auto => { if mpk::is_supported() { mpk::keys() } else { &[] } } - AutoEnabled::Enable => { + MpkEnabled::Enable => { if mpk::is_supported() { mpk::keys() } else { bail!("mpk is disabled on this system") } } - AutoEnabled::Disable => &[], + MpkEnabled::Disable => &[], }; // This is a tricky bit of global state: when creating a memory pool @@ -202,27 +185,12 @@ impl MemoryPool { mpk::allow(ProtectionMask::all()); } - // Interpret the larger of the maximal size of memory or the static - // memory bound as the size of the virtual address space reservation for - // memory itself. Typically `static_memory_bound` is 4G which helps - // elide most bounds checks in wasm. If `memory_pages` is larger, - // though, then this is a non-moving pooling allocator so we create - // larger reservations to account for that. - let max_memory_bytes = config.limits.memory_pages.max(tunables.static_memory_bound) - * u64::from(WASM_PAGE_SIZE); - - // Create a slab layout and allocate it as completely inaccessible + // Create a slab layout and allocate it as a completely inaccessible // region to start--`PROT_NONE`. - let constraints = SlabConstraints { - max_memory_bytes: max_memory_bytes as usize, - num_memory_slots: config.limits.total_memories as usize, - num_pkeys_available: pkeys.len(), - guard_bytes: tunables.static_memory_offset_guard_size as usize, - guard_before_slots: tunables.guard_before_linear_memory, - }; + let constraints = SlabConstraints::new(&config.limits, tunables, pkeys.len())?; let layout = calculate(&constraints)?; log::debug!("creating memory pool: {constraints:?} -> {layout:?}"); - let mut mapping = Mmap::accessible_reserved(0, layout.total_slab_bytes) + let mut mapping = Mmap::accessible_reserved(0, layout.total_slab_bytes()?) .context("failed to create memory pool mapping")?; // Then, stripe the memory with the available protection keys. This is @@ -230,7 +198,7 @@ impl MemoryPool { if layout.num_stripes >= 2 { let mut cursor = layout.pre_slab_guard_bytes; let pkeys = &pkeys[..layout.num_stripes]; - for i in 0..constraints.num_memory_slots { + for i in 0..constraints.num_slots { let pkey = &pkeys[i % pkeys.len()]; let region = unsafe { mapping.slice_mut(cursor..cursor + layout.slot_bytes) }; pkey.protect(region)?; @@ -238,17 +206,17 @@ impl MemoryPool { } debug_assert_eq!( cursor + layout.post_slab_guard_bytes, - layout.total_slab_bytes + layout.total_slab_bytes()? 
); } let image_slots: Vec<_> = std::iter::repeat_with(|| Mutex::new(None)) - .take(constraints.num_memory_slots) + .take(constraints.num_slots) .collect(); let create_stripe = |i| { - let num_slots = constraints.num_memory_slots / layout.num_stripes - + usize::from(constraints.num_memory_slots % layout.num_stripes > i); + let num_slots = constraints.num_slots / layout.num_stripes + + usize::from(constraints.num_slots % layout.num_stripes > i); let allocator = ModuleAffinityIndexAllocator::new( num_slots.try_into().unwrap(), config.max_unused_warm_slots, @@ -269,12 +237,8 @@ impl MemoryPool { stripes, mapping, image_slots, - memory_size: constraints.max_memory_bytes, - memory_and_guard_size: layout.slot_bytes, - pre_slab_guard_size: layout.pre_slab_guard_bytes, - max_total_memories: constraints.num_memory_slots, + layout, memories_per_instance: usize::try_from(config.limits.max_memories_per_module).unwrap(), - max_accessible: (config.limits.memory_pages as usize) * (WASM_PAGE_SIZE as usize), keep_resident: config.linear_memory_keep_resident, next_available_pkey: AtomicUsize::new(0), }; @@ -303,6 +267,7 @@ impl MemoryPool { ); } + let max_memory_pages = self.layout.max_memory_bytes / WASM_PAGE_SIZE as usize; for (i, plan) in module .memory_plans .iter() @@ -310,7 +275,7 @@ impl MemoryPool { { match plan.style { MemoryStyle::Static { bound } => { - if u64::try_from(self.memory_size).unwrap() < bound { + if u64::try_from(self.layout.pages_to_next_stripe_slot()).unwrap() < bound { bail!( "memory size allocated per-memory is too small to \ satisfy static bound of {bound:#x} pages" @@ -319,13 +284,12 @@ impl MemoryPool { } MemoryStyle::Dynamic { .. } => {} } - let max = self.max_accessible / (WASM_PAGE_SIZE as usize); - if plan.memory.minimum > u64::try_from(max).unwrap() { + if plan.memory.minimum > u64::try_from(max_memory_pages).unwrap() { bail!( "memory index {} has a minimum page size of {} which exceeds the limit of {}", i.as_u32(), plan.memory.minimum, - max, + max_memory_pages, ); } } @@ -334,12 +298,7 @@ impl MemoryPool { /// Are zero slots in use right now? pub fn is_empty(&self) -> bool { - for stripe in &self.stripes { - if !stripe.allocator.is_empty() { - return false; - } - } - true + self.stripes.iter().all(|s| s.allocator.is_empty()) } /// Allocate a single memory for the given instance allocation request. @@ -383,13 +342,13 @@ impl MemoryPool { match memory_plan.style { MemoryStyle::Static { bound } => { let bound = bound * u64::from(WASM_PAGE_SIZE); - assert!(bound <= u64::try_from(self.memory_size).unwrap()); + assert!(bound <= u64::try_from(self.layout.slot_bytes).unwrap()); } MemoryStyle::Dynamic { .. 
} => {} } let base_ptr = self.get_base(allocation_index); - let base_capacity = self.max_accessible; + let base_capacity = self.layout.max_memory_bytes; let mut slot = self.take_memory_image_slot(allocation_index); let image = request.runtime_info.memory_image(memory_index)?; @@ -415,7 +374,7 @@ impl MemoryPool { base_ptr, base_capacity, slot, - self.memory_and_guard_size, + self.layout.slot_bytes, unsafe { &mut *request.store.get().unwrap() }, ) })() { @@ -447,11 +406,11 @@ impl MemoryPool { self.return_memory_image_slot(allocation_index, image); } - let stripe_index = allocation_index.index() % self.stripes.len(); - let stripe_slot = allocation_index.0 / self.stripes.len() as u32; + let (stripe_index, striped_allocation_index) = + StripedAllocationIndex::from_unstriped_slot_index(allocation_index, self.stripes.len()); self.stripes[stripe_index] .allocator - .free(SlotId(stripe_slot)); + .free(SlotId(striped_allocation_index.0)); } /// Purging everything related to `module`. @@ -501,9 +460,9 @@ impl MemoryPool { } fn get_base(&self, allocation_index: MemoryAllocationIndex) -> *mut u8 { - assert!(allocation_index.index() < self.max_total_memories); + assert!(allocation_index.index() < self.layout.num_slots); let offset = - self.pre_slab_guard_size + allocation_index.index() * self.memory_and_guard_size; + self.layout.pre_slab_guard_bytes + allocation_index.index() * self.layout.slot_bytes; unsafe { self.mapping.as_ptr().offset(offset as isize).cast_mut() } } @@ -519,7 +478,7 @@ impl MemoryPool { MemoryImageSlot::create( self.get_base(allocation_index) as *mut c_void, 0, - self.max_accessible, + self.layout.max_memory_bytes, ) }) } @@ -554,6 +513,16 @@ impl Drop for MemoryPool { pub struct StripedAllocationIndex(u32); impl StripedAllocationIndex { + fn from_unstriped_slot_index( + index: MemoryAllocationIndex, + num_stripes: usize, + ) -> (usize, Self) { + let stripe_index = index.index() % num_stripes; + let num_stripes: u32 = num_stripes.try_into().unwrap(); + let index_within_stripe = Self(index.0 / num_stripes); + (stripe_index, index_within_stripe) + } + fn as_unstriped_slot_index(self, stripe: usize, num_stripes: usize) -> MemoryAllocationIndex { let num_stripes: u32 = num_stripes.try_into().unwrap(); let stripe: u32 = stripe.try_into().unwrap(); @@ -563,24 +532,64 @@ impl StripedAllocationIndex { #[derive(Clone, Debug)] struct SlabConstraints { + /// Essentially, the `static_memory_bound`: this is an assumption that the + /// runtime and JIT compiler make about how much space will be guarded + /// between slots. + expected_slot_bytes: usize, + /// The maximum size of any memory in the pool. max_memory_bytes: usize, - num_memory_slots: usize, + num_slots: usize, num_pkeys_available: usize, guard_bytes: usize, guard_before_slots: bool, } +impl SlabConstraints { + fn new( + limits: &InstanceLimits, + tunables: &Tunables, + num_pkeys_available: usize, + ) -> Result { + // The maximum size a memory can grow to in this pool. + let max_memory_bytes = limits.memory_pages * u64::from(WASM_PAGE_SIZE); + + // `static_memory_bound` is the configured number of Wasm pages for a + // static memory slot (see `Config::static_memory_maximum_size`); even + // if the memory never grows to this size (e.g., it has a lower memory + // maximum), codegen will assume that this unused memory is mapped + // `PROT_NONE`. Typically `static_memory_bound` is 4G which helps elide + // most bounds checks. 
`MemoryPool` must respect this bound, though not
+        // explicitly: if we can achieve the same effect via MPK-protected
+        // stripes, the slot size can be lower than the `static_memory_bound`.
+        let expected_slot_bytes = tunables.static_memory_bound * u64::from(WASM_PAGE_SIZE);
+
+        let constraints = SlabConstraints {
+            max_memory_bytes: max_memory_bytes
+                .try_into()
+                .context("max memory is too large")?,
+            num_slots: limits
+                .total_memories
+                .try_into()
+                .context("too many memories")?,
+            expected_slot_bytes: expected_slot_bytes
+                .try_into()
+                .context("static memory bound is too large")?,
+            num_pkeys_available,
+            guard_bytes: tunables
+                .static_memory_offset_guard_size
+                .try_into()
+                .context("guard region is too large")?,
+            guard_before_slots: tunables.guard_before_linear_memory,
+        };
+        Ok(constraints)
+    }
+}
+
 #[derive(Debug)]
 struct SlabLayout {
-    /// The total number of bytes to allocate for the memory pool slab.
-    total_slab_bytes: usize,
-    /// If necessary, the number of bytes to reserve as a guard region at the
-    /// beginning of the slab.
-    pre_slab_guard_bytes: usize,
-    /// If necessary, the number of bytes to reserve as a guard region at the
-    /// beginning of the slab.
-    post_slab_guard_bytes: usize,
-    /// The size of each slot in the memory pool; this comprehends the maximum
+    /// The total number of slots available in the memory pool slab.
+    num_slots: usize,
+    /// The size of each slot in the memory pool; this contains the maximum
     /// memory size (i.e., from WebAssembly or Wasmtime configuration) plus any
     /// guard region after the memory to catch OOB access. On these guard
     /// regions, note that:
@@ -588,16 +597,64 @@ struct SlabLayout {
     /// via `Config::static_memory_guard_size`
     /// - memory protection keys can compress the size of the guard region by
     /// placing slots from a different key (i.e., a stripe) in the guard
-    /// region
+    /// region; this means the slot itself can be smaller and we can allocate
+    /// more of them.
     slot_bytes: usize,
+    /// The maximum size that can become accessible, in bytes, for each linear
+    /// memory. Guaranteed to be a whole number of wasm pages.
+    max_memory_bytes: usize,
+    /// If necessary, the number of bytes to reserve as a guard region at the
+    /// beginning of the slab.
+    pre_slab_guard_bytes: usize,
+    /// Like `pre_slab_guard_bytes`, but at the end of the slab.
+    post_slab_guard_bytes: usize,
     /// The number of stripes needed in the slab layout.
     num_stripes: usize,
 }
 
+impl SlabLayout {
+    /// Return the total size of the slab, using the final layout (where `n =
+    /// num_slots`):
+    ///
+    /// ```text
+    /// ┌────────────────────┬──────┬──────┬───┬──────┬─────────────────────┐
+    /// │pre_slab_guard_bytes│slot 1│slot 2│...│slot n│post_slab_guard_bytes│
+    /// └────────────────────┴──────┴──────┴───┴──────┴─────────────────────┘
+    /// ```
+    fn total_slab_bytes(&self) -> Result<usize> {
+        self.slot_bytes
+            .checked_mul(self.num_slots)
+            .and_then(|c| c.checked_add(self.pre_slab_guard_bytes))
+            .and_then(|c| c.checked_add(self.post_slab_guard_bytes))
+            .ok_or_else(|| anyhow!("total size of memory reservation exceeds addressable memory"))
+    }
+
+    /// Returns the number of Wasm pages from the beginning of one slot to the
+    /// next slot in the same stripe--this is the striped equivalent of
+    /// `static_memory_bound`. Recall that between slots of the same stripe we
+    /// will see a slot from every other stripe.
+    ///
+    /// For example, in a 3-stripe pool, this function measures the distance
+    /// from the beginning of slot 1 to slot 4, which are of the same stripe:
+    ///
+    /// ```text
+    /// ┌────────┬──────┬──────┬────────┬───┐
+    /// │*slot 1*│slot 2│slot 3│*slot 4*│...│
+    /// └────────┴──────┴──────┴────────┴───┘
+    /// ```
+    fn pages_to_next_stripe_slot(&self) -> usize {
+        let slot_pages = self.slot_bytes / WASM_PAGE_SIZE as usize;
+        slot_pages * self.num_stripes
+    }
+}
+
 fn calculate(constraints: &SlabConstraints) -> Result<SlabLayout> {
     let SlabConstraints {
         max_memory_bytes,
-        num_memory_slots,
+        num_slots,
+        expected_slot_bytes,
         num_pkeys_available,
         guard_bytes,
         guard_before_slots,
     } = *constraints;
@@ -610,78 +667,88 @@ fn calculate(constraints: &SlabConstraints) -> Result<SlabLayout> {
     // accesses greater than this.
     let pre_slab_guard_bytes = if guard_before_slots { guard_bytes } else { 0 };
 
-    let (num_stripes, needed_guard_bytes) = if guard_bytes == 0 || max_memory_bytes == 0 {
-        // In the uncommon case where the memory or guard regions are empty, we
+    // To calculate the slot size, we start with the default configured size and
+    // attempt to chip away at this via MPK protection. Note here how we begin
+    // to define a slot as "all of the memory and guard region."
+    let slot_bytes = expected_slot_bytes
+        .max(max_memory_bytes)
+        .checked_add(guard_bytes)
+        .unwrap_or(usize::MAX);
+
+    let (num_stripes, slot_bytes) = if guard_bytes == 0 || max_memory_bytes == 0 || num_slots == 0 {
+        // In the uncommon case where the memory/guard regions are empty or we don't need any slots, we
         // will not need any stripes: we just lay out the slots back-to-back
         // using a single stripe.
-        (1, guard_bytes)
+        (1, slot_bytes)
     } else if num_pkeys_available < 2 {
         // If we do not have enough protection keys to stripe the memory, we do
         // the same. We can't elide any of the guard bytes because we aren't
         // overlapping guard regions with other stripes...
-        (1, guard_bytes)
-    } else if num_memory_slots == 0 {
-        (1, guard_bytes)
+        (1, slot_bytes)
     } else {
         // ...but if we can create at least two stripes, we can use another
         // stripe (i.e., a different pkey) as this slot's guard region--this
-        // reduces the guard bytes each slot has to allocate. We must make sure,
-        // though, that if the size of that other stripe(s) does not fully cover
-        // `guard_bytes`, we keep those around to prevent OOB access.
-        //
-        // We first calculate the number of stripes we need: we want to minimize
-        // this so that there is less chance of a single store running out of
-        // slots with its stripe--we need at least two, though. But this is not
-        // just an optimization; we need to handle the case when there are fewer
-        // slots than stripes. E.g., if our pool is configured with only three
-        // slots (`num_memory_slots = 3`), we will run into failures if we
-        // attempt to set up more than three stripes.
+        // reduces the guard bytes each slot has to allocate. We must make
+        // sure, though, that if the size of that other stripe(s) does not
+        // fully cover `guard_bytes`, we keep those around to prevent OOB
+        // access.
+
+        // We first calculate the number of stripes we need: we want to
+        // minimize this so that there is less chance of a single store
+        // running out of slots with its stripe--we need at least two,
+        // though. But this is not just an optimization; we need to handle
+        // the case when there are fewer slots than stripes.
E.g., if our + // pool is configured with only three slots (`num_memory_slots = + // 3`), we will run into failures if we attempt to set up more than + // three stripes. let needed_num_stripes = - guard_bytes / max_memory_bytes + usize::from(guard_bytes % max_memory_bytes != 0) + 1; - let num_stripes = num_pkeys_available - .min(needed_num_stripes) - .min(num_memory_slots); + slot_bytes / max_memory_bytes + usize::from(slot_bytes % max_memory_bytes != 0) + 1; + let num_stripes = num_pkeys_available.min(needed_num_stripes).min(num_slots); + + // Next, we try to reduce the slot size by "overlapping" the + // stripes: we can make slot `n` smaller since we know that slot + // `n+1` and following are in different stripes and will look just + // like `PROT_NONE` memory. let next_slots_overlapping_bytes = max_memory_bytes .checked_mul(num_stripes - 1) .unwrap_or(usize::MAX); - let needed_guard_bytes = guard_bytes + let needed_slot_bytes = slot_bytes .checked_sub(next_slots_overlapping_bytes) - .unwrap_or(0); - (num_stripes, needed_guard_bytes) + .unwrap_or(0) + .max(max_memory_bytes); + (num_stripes, needed_slot_bytes) }; // The page-aligned slot size; equivalent to `memory_and_guard_size`. let page_alignment = crate::page_size() - 1; - let slot_bytes = max_memory_bytes - .checked_add(needed_guard_bytes) - .and_then(|slot_bytes| slot_bytes.checked_add(page_alignment)) + let slot_bytes = slot_bytes + .checked_add(page_alignment) .and_then(|slot_bytes| Some(slot_bytes & !page_alignment)) .ok_or_else(|| anyhow!("slot size is too large"))?; // We may need another guard region (like `pre_slab_guard_bytes`) at the end // of our slab. We could be conservative and just create it as large as - // `guard_bytes`, but because we know that the last slot already has a - // region as large as `needed_guard_bytes`, we can reduce the final guard - // region by that much. - let post_slab_guard_bytes = guard_bytes - needed_guard_bytes; - - // The final layout (where `n = num_memory_slots`): - // ┌────────────────────┬──────┬──────┬───┬──────┬─────────────────────┐ - // │pre_slab_guard_bytes│slot 1│slot 2│...│slot n│post_slab_guard_bytes│ - // └────────────────────┴──────┴──────┴───┴──────┴─────────────────────┘ - let total_slab_bytes = slot_bytes - .checked_mul(num_memory_slots) - .and_then(|c| c.checked_add(pre_slab_guard_bytes)) - .and_then(|c| c.checked_add(post_slab_guard_bytes)) - .ok_or_else(|| anyhow!("total size of memory reservation exceeds addressable memory"))?; - - Ok(SlabLayout { - total_slab_bytes, + // `guard_bytes`, but because we know that the last slot already has + // `guard_bytes` factored in to its guard region, we can reduce the final + // guard region by that much. + let post_slab_guard_bytes = guard_bytes + .checked_sub(slot_bytes - max_memory_bytes) + .unwrap_or(0); + + // Check that we haven't exceeded the slab we can calculate given the limits + // of `usize`. 
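+    // (Concretely, `total_slab_bytes` computes the checked sum
+    // `pre_slab_guard_bytes + num_slots * slot_bytes + post_slab_guard_bytes`,
+    // so an unaddressable slab surfaces as an error rather than as a silent
+    // wrap-around.)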
+    let layout = SlabLayout {
+        num_slots,
+        slot_bytes,
+        max_memory_bytes,
+        pre_slab_guard_bytes,
+        post_slab_guard_bytes,
+        num_stripes,
+    };
+    layout.total_slab_bytes()?;
+    Ok(layout)
 }
 
 #[cfg(test)]
@@ -713,16 +780,16 @@ mod tests {
             },
         )?;
 
-        assert_eq!(pool.memory_and_guard_size, WASM_PAGE_SIZE as usize);
-        assert_eq!(pool.max_total_memories, 5);
-        assert_eq!(pool.max_accessible, WASM_PAGE_SIZE as usize);
+        assert_eq!(pool.layout.slot_bytes, WASM_PAGE_SIZE as usize);
+        assert_eq!(pool.layout.num_slots, 5);
+        assert_eq!(pool.layout.max_memory_bytes, WASM_PAGE_SIZE as usize);
 
         let base = pool.mapping.as_ptr() as usize;
 
         for i in 0..5 {
             let index = MemoryAllocationIndex(i);
             let ptr = pool.get_base(index);
-            assert_eq!(ptr as usize - base, i as usize * pool.memory_and_guard_size);
+            assert_eq!(ptr as usize - base, i as usize * pool.layout.slot_bytes);
         }
 
         Ok(())
@@ -747,10 +814,11 @@ mod tests {
             },
         )
         .unwrap();
-        assert_eq!(pool.memories.memory_size, 2 * 65536);
+        assert_eq!(pool.memories.layout.max_memory_bytes, 2 * 65536);
     }
 
     #[test]
+    #[cfg_attr(miri, ignore)]
     fn test_pooling_allocator_striping() {
         if !mpk::is_supported() {
             println!("skipping `test_pooling_allocator_striping` test; mpk is not supported");
@@ -759,7 +827,7 @@ mod tests {
 
         // Force the use of MPK.
         let config = PoolingInstanceAllocatorConfig {
-            memory_protection_keys: AutoEnabled::Enable,
+            memory_protection_keys: MpkEnabled::Enable,
             ..PoolingInstanceAllocatorConfig::default()
         };
         let pool = MemoryPool::new(&config, &Tunables::default()).unwrap();
@@ -780,20 +848,23 @@ mod tests {
     fn check_known_layout_calculations() {
         for num_pkeys_available in 0..16 {
             for num_memory_slots in [0, 1, 10, 64] {
-                for max_memory_bytes in
-                    [0, 1 * WASM_PAGE_SIZE as usize, 10 * WASM_PAGE_SIZE as usize]
-                {
-                    for guard_bytes in [0, 2 << 30 /* 2GB */] {
-                        for guard_before_slots in [true, false] {
-                            let constraints = SlabConstraints {
-                                max_memory_bytes,
-                                num_memory_slots,
-                                num_pkeys_available,
-                                guard_bytes,
-                                guard_before_slots,
-                            };
-                            let layout = calculate(&constraints);
-                            assert_slab_layout_invariants(constraints, layout.unwrap());
+                for expected_slot_bytes in [0, 1 << 30 /* 1GB */, 4 << 30 /* 4GB */] {
+                    for max_memory_bytes in
+                        [0, 1 * WASM_PAGE_SIZE as usize, 10 * WASM_PAGE_SIZE as usize]
+                    {
+                        for guard_bytes in [0, 2 << 30 /* 2GB */] {
+                            for guard_before_slots in [true, false] {
+                                let constraints = SlabConstraints {
+                                    max_memory_bytes,
+                                    num_slots: num_memory_slots,
+                                    expected_slot_bytes,
+                                    num_pkeys_available,
+                                    guard_bytes,
+                                    guard_before_slots,
+                                };
+                                let layout = calculate(&constraints);
+                                assert_slab_layout_invariants(constraints, layout.unwrap());
+                            }
                         }
                     }
                 }
@@ -803,6 +874,7 @@ mod tests {
 
     proptest! {
         #[test]
+        #[cfg_attr(miri, ignore)]
         fn check_random_layout_calculations(c in constraints()) {
             if let Ok(l) = calculate(&c) {
                 assert_slab_layout_invariants(c, l);
@@ -816,19 +888,22 @@ mod tests {
     fn constraints() -> impl Strategy<Value = SlabConstraints> {
         (
             any::<usize>(),
             any::<usize>(),
+            any::<usize>(),
             any::<usize>(),
             any::<usize>(),
             any::<bool>(),
         )
         .prop_map(
             |(
                 max_memory_bytes,
                 num_memory_slots,
+                expected_slot_bytes,
                 num_pkeys_available,
                 guard_bytes,
                 guard_before_slots,
             )| {
                 SlabConstraints {
                     max_memory_bytes,
-                    num_memory_slots,
+                    num_slots: num_memory_slots,
+                    expected_slot_bytes,
                     num_pkeys_available,
                     guard_bytes,
                     guard_before_slots,
                 }
             },
         )
     }

     fn assert_slab_layout_invariants(c: SlabConstraints, s: SlabLayout) {
        // Check that all the sizes add up.
assert_eq!( - s.total_slab_bytes, - s.pre_slab_guard_bytes + s.slot_bytes * c.num_memory_slots + s.post_slab_guard_bytes, + s.total_slab_bytes().unwrap(), + s.pre_slab_guard_bytes + s.slot_bytes * c.num_slots + s.post_slab_guard_bytes, "the slab size does not add up: {c:?} => {s:?}" ); + assert!( + s.slot_bytes >= s.max_memory_bytes, + "slot is not big enough: {c:?} => {s:?}" + ); - // Check that the memory slot size is page-aligned. + // Check that the various memory values are page-aligned. assert!( - s.slot_bytes % crate::page_size() == 0, - "slot size is not page-aligned: {c:?} => {s:?}", + is_aligned(s.slot_bytes), + "slot is not page-aligned: {c:?} => {s:?}", + ); + assert!( + is_aligned(s.max_memory_bytes), + "slot guard region is not page-aligned: {c:?} => {s:?}", + ); + assert!( + is_aligned(s.pre_slab_guard_bytes), + "pre-slab guard region is not page-aligned: {c:?} => {s:?}" + ); + assert!( + is_aligned(s.post_slab_guard_bytes), + "post-slab guard region is not page-aligned: {c:?} => {s:?}" + ); + assert!( + is_aligned(s.total_slab_bytes().unwrap()), + "slab is not page-aligned: {c:?} => {s:?}" ); // Check that we use no more or less stripes than needed. assert!(s.num_stripes >= 1, "not enough stripes: {c:?} => {s:?}"); - if c.num_pkeys_available == 0 || c.num_memory_slots == 0 { + if c.num_pkeys_available == 0 || c.num_slots == 0 { assert_eq!( s.num_stripes, 1, "expected at least one stripe: {c:?} => {s:?}" @@ -864,13 +959,16 @@ mod tests { "layout has more stripes than available pkeys: {c:?} => {s:?}" ); assert!( - s.num_stripes <= c.num_memory_slots, + s.num_stripes <= c.num_slots, "layout has more stripes than memory slots: {c:?} => {s:?}" ); } // Check that we use the minimum number of stripes/protection keys. - // - if the next slot is bigger + // - if the next MPK-protected slot is bigger or the same as the + // required guard region, we only need two stripes + // - if the next slot is smaller than the guard region, we only need + // enough stripes to add up to at least that guard region size. if c.num_pkeys_available > 1 && c.max_memory_bytes > 0 { assert!( s.num_stripes <= (c.guard_bytes / c.max_memory_bytes) + 2, @@ -879,11 +977,22 @@ mod tests { } // Check that the memory-striping will not allow OOB access. 
- if s.num_stripes > 1 { - assert!( - s.slot_bytes * (s.num_stripes - 1) >= c.guard_bytes, - "layout may allow OOB access: {c:?} => {s:?}" - ); - } + // - we may have reduced the slot size from `expected_slot_bytes` to + // `slot_bytes` assuming MPK striping; we check that the expectation + // still holds + // - the last slot won't have MPK striping after it; we check that the + // `post_slab_guard_bytes` accounts for this + assert!( + s.slot_bytes * s.num_stripes >= c.expected_slot_bytes + c.guard_bytes, + "slot may allow OOB access: {c:?} => {s:?}" + ); + assert!( + s.slot_bytes + s.post_slab_guard_bytes >= c.expected_slot_bytes, + "last slot may allow OOB access: {c:?} => {s:?}" + ); + } + + fn is_aligned(bytes: usize) -> bool { + bytes % crate::page_size() == 0 } } diff --git a/crates/runtime/src/lib.rs b/crates/runtime/src/lib.rs index ed2c4415ded6..9b5e631768be 100644 --- a/crates/runtime/src/lib.rs +++ b/crates/runtime/src/lib.rs @@ -31,6 +31,7 @@ mod vmcontext; pub mod debug_builtins; pub mod libcalls; +pub mod mpk; pub use wasmtime_jit_debug::gdb_jit_int::GdbJitImageRegistration; @@ -50,6 +51,7 @@ pub use crate::memory::{ }; pub use crate::mmap::Mmap; pub use crate::mmap_vec::MmapVec; +pub use crate::mpk::MpkEnabled; pub use crate::store_box::*; pub use crate::table::{Table, TableElement}; pub use crate::traphandlers::*; diff --git a/crates/runtime/src/mpk/disabled.rs b/crates/runtime/src/mpk/disabled.rs index 777fe3278e5d..bbeccc80e893 100644 --- a/crates/runtime/src/mpk/disabled.rs +++ b/crates/runtime/src/mpk/disabled.rs @@ -14,13 +14,13 @@ pub fn keys() -> &'static [ProtectionKey] { pub fn allow(_: ProtectionMask) {} #[derive(Clone, Copy, Debug)] -pub struct ProtectionKey; +pub enum ProtectionKey {} impl ProtectionKey { pub fn protect(&self, _: &mut [u8]) -> Result<()> { - Ok(()) + match *self {} } pub fn as_stripe(&self) -> usize { - 0 + match *self {} } } diff --git a/crates/runtime/src/mpk/enabled.rs b/crates/runtime/src/mpk/enabled.rs index 087a685c3f25..37456ef13c44 100644 --- a/crates/runtime/src/mpk/enabled.rs +++ b/crates/runtime/src/mpk/enabled.rs @@ -7,7 +7,6 @@ use std::sync::OnceLock; /// Check if the MPK feature is supported. pub fn is_supported() -> bool { cfg!(target_os = "linux") && cfg!(target_arch = "x86_64") && pkru::has_cpuid_bit_set() - // TODO: we cannot check CR4 due to privilege } /// Allocate all protection keys available to this process. @@ -45,44 +44,28 @@ static KEYS: OnceLock> = OnceLock::new(); /// Any accesses to pages marked by another key will result in a `SIGSEGV` /// fault. pub fn allow(mask: ProtectionMask) { - let mut allowed = 0; - for i in 0..16 { - if mask.0 & (1 << i) == 1 { - allowed |= 0b11 << (i * 2); - } - } - let previous = pkru::read(); - pkru::write(pkru::DISABLE_ACCESS ^ allowed); - log::debug!("PKRU change: {:#034b} => {:#034b}", previous, pkru::read()); + pkru::write(mask.0); + log::trace!("PKRU change: {:#034b} => {:#034b}", previous, pkru::read()); } /// An MPK protection key. /// /// The expected usage is: -/// - allocate a new key with [`Pkey::new`] -/// - mark some regions of memory as accessible with [`Pkey::protect`] +/// - receive system-allocated keys from [`keys`] +/// - mark some regions of memory as accessible with [`ProtectionKey::protect`] /// - [`allow`] or disallow access to the memory regions using a /// [`ProtectionMask`]; any accesses to unmarked pages result in a fault /// - drop the key -/// -/// Since this kernel is allocated from the kernel, we must inform the kernel -/// when it is dropped. 
Similarly, to retrieve all available protection keys, -/// one must request them from the kernel (e.g., call [`Pkey::new`] until it -/// fails). -/// -/// Because MPK may not be available on all systems, [`Pkey`] wraps an `Option` -/// that will always be `None` if MPK is not supported. The idea here is that -/// the API can remain the same regardless of MPK support. #[derive(Clone, Copy, Debug)] pub struct ProtectionKey(u32); impl ProtectionKey { - /// Mark a page as protected by this [`Pkey`]. + /// Mark a page as protected by this [`ProtectionKey`]. /// /// This "colors" the pages of `region` via a kernel `pkey_mprotect` call to - /// only allow reads and writes when this [`Pkey`] is activated (see - /// [`Pkey::activate`]). + /// only allow reads and writes when this [`ProtectionKey`] is activated + /// (see [`allow`]). /// /// # Errors /// @@ -99,7 +82,7 @@ impl ProtectionKey { }) } - /// Convert the [`Pkey`] to its 0-based index; this is useful for + /// Convert the [`ProtectionKey`] to its 0-based index; this is useful for /// determining which allocation "stripe" a key belongs to. /// /// This function assumes that the kernel has allocated key 0 for itself. @@ -109,30 +92,51 @@ impl ProtectionKey { } } -/// A bit field indicating which protection keys should be *allowed*. +/// A bit field indicating which protection keys should be allowed and disabled. /// -/// When bit `n` is set, it means the protection key is allowed--conversely, -/// protection is disabled for this key. -pub struct ProtectionMask(u16); +/// The internal representation makes it easy to use [`ProtectionMask`] directly +/// with the PKRU register. When bits `n` and `n+1` are set, it means the +/// protection key is *not* allowed (see the PKRU write and access disabled +/// bits). +pub struct ProtectionMask(u32); impl ProtectionMask { /// Allow access from all protection keys. + #[inline] pub fn all() -> Self { - Self(u16::MAX) + Self(pkru::ALLOW_ACCESS) } /// Only allow access to memory protected with protection key 0; note that /// this does not mean "none" but rather allows access from the default /// kernel protection key. + #[inline] pub fn zero() -> Self { - Self(1) + Self(pkru::DISABLE_ACCESS ^ 0b11) } /// Include `pkey` as another allowed protection key in the mask. + #[inline] pub fn or(self, pkey: ProtectionKey) -> Self { - Self(self.0 | 1 << pkey.0) + let mask = pkru::DISABLE_ACCESS ^ 0b11 << (pkey.0 * 2); + Self(self.0 & mask) } } +/// Helper macro for skipping tests on systems that do not have MPK enabled +/// (e.g., older architecture, disabled by kernel, etc.) +#[cfg(test)] +macro_rules! skip_if_mpk_unavailable { + () => { + if !crate::mpk::is_supported() { + println!("> mpk is not supported: ignoring test"); + return; + } + }; +} +/// Necessary for inter-module access. +#[cfg(test)] +pub(crate) use skip_if_mpk_unavailable; + #[cfg(test)] mod tests { use super::*; @@ -151,6 +155,7 @@ mod tests { #[test] fn check_invalid_mark() { + skip_if_mpk_unavailable!(); let pkey = keys()[0]; let unaligned_region = unsafe { let addr = 1 as *mut u8; // this is not page-aligned! 
@@ -164,4 +169,25 @@ mod tests { "failed to mark region with pkey (addr = 0x1, len = 1, prot = 0b11)" ); } + + #[test] + fn check_masking() { + skip_if_mpk_unavailable!(); + let original = pkru::read(); + + allow(ProtectionMask::all()); + assert_eq!(0, pkru::read()); + + allow(ProtectionMask::all().or(ProtectionKey(5))); + assert_eq!(0, pkru::read()); + + allow(ProtectionMask::zero()); + assert_eq!(0b11111111_11111111_11111111_11111100, pkru::read()); + + allow(ProtectionMask::zero().or(ProtectionKey(5))); + assert_eq!(0b11111111_11111111_11110011_11111100, pkru::read()); + + // Reset the PKRU state to what we originally observed. + pkru::write(original); + } } diff --git a/crates/runtime/src/mpk/mod.rs b/crates/runtime/src/mpk/mod.rs index 81156183cf2a..bfffe24118f7 100644 --- a/crates/runtime/src/mpk/mod.rs +++ b/crates/runtime/src/mpk/mod.rs @@ -5,11 +5,11 @@ //! AMD CPUs. In Linux, this feature is named `pku` (protection keys userspace) //! and consists of three new system calls: `pkey_alloc`, `pkey_free`, and //! `pkey_mprotect` (see the [Linux documentation]). This crate provides an -//! abstraction, [`Pkey`], that the [pooling allocator] applies to contiguous -//! memory allocations, allowing it to avoid guard pages in some cases and more -//! efficiently use memory. This technique was first presented in a 2022 paper: -//! [Segue and ColorGuard: Optimizing SFI Performance and Scalability on Modern -//! x86][colorguard]. +//! abstraction, [`ProtectionKey`], that the [pooling allocator] applies to +//! contiguous memory allocations, allowing it to avoid guard pages in some +//! cases and more efficiently use memory. This technique was first presented in +//! a 2022 paper: [Segue and ColorGuard: Optimizing SFI Performance and +//! Scalability on Modern x86][colorguard]. //! //! [pooling allocator]: crate::PoolingInstanceAllocator //! [Linux documentation]: @@ -19,23 +19,36 @@ //! On x86_64 Linux systems, this module implements the various parts necessary //! to use MPK in Wasmtime: //! - [`is_supported`] indicates whether the feature is available at runtime -//! - [`Pkey`] provides safe access to the kernel-allocated protection keys +//! - [`ProtectionKey`] provides access to the kernel-allocated protection keys +//! (see [`keys`]) +//! - [`allow`] sets the CPU state to prevent access to regions outside the +//! [`ProtectionMask`] //! - the `sys` module bridges the gap to Linux's `pkey_*` system calls //! - the `pkru` module controls the x86 `PKRU` register (and other CPU state) //! //! On any other kind of machine, this module exposes noop implementations of //! the public interface. -#[cfg(all(target_arch = "x86_64", target_os = "linux"))] -mod enabled; -#[cfg(all(target_arch = "x86_64", target_os = "linux"))] -mod pkru; -#[cfg(all(target_arch = "x86_64", target_os = "linux"))] -mod sys; -#[cfg(all(target_arch = "x86_64", target_os = "linux"))] -pub use enabled::{allow, is_supported, keys, ProtectionKey, ProtectionMask}; +cfg_if::cfg_if! 
{ + if #[cfg(all(target_arch = "x86_64", target_os = "linux", not(miri)))] { + mod enabled; + mod pkru; + mod sys; + pub use enabled::{allow, is_supported, keys, ProtectionKey, ProtectionMask}; + } else { + mod disabled; + pub use disabled::{allow, is_supported, keys, ProtectionKey, ProtectionMask}; + } +} -#[cfg(not(all(target_arch = "x86_64", target_os = "linux")))] -mod disabled; -#[cfg(not(all(target_arch = "x86_64", target_os = "linux")))] -pub use disabled::{allow, is_supported, keys, ProtectionKey, ProtectionMask}; +/// Describe the tri-state configuration of memory protection keys (MPK). +#[derive(Clone, Copy, Debug)] +pub enum MpkEnabled { + /// Use MPK if supported by the current system; fall back to guard regions + /// otherwise. + Auto, + /// Use MPK or fail if not supported. + Enable, + /// Do not use MPK. + Disable, +} diff --git a/crates/runtime/src/mpk/pkru.rs b/crates/runtime/src/mpk/pkru.rs index a7e3bcde185f..99dee74fa0f7 100644 --- a/crates/runtime/src/mpk/pkru.rs +++ b/crates/runtime/src/mpk/pkru.rs @@ -23,8 +23,7 @@ use core::arch::asm; /// This `PKRU` register mask allows access to any pages marked with any /// key--in other words, reading and writing is permitted to all pages. -#[cfg(test)] -const ALLOW_ACCESS: u32 = 0; +pub const ALLOW_ACCESS: u32 = 0; /// This `PKRU` register mask disables access to any page marked with any /// key--in other words, no reading or writing to all pages. @@ -61,25 +60,15 @@ pub fn has_cpuid_bit_set() -> bool { (result.ecx & 0b100) != 0 } -/// Check that the `CR4.PKE` flag (bit 22) is set; see the Intel Software -/// Development Manual, vol 3a, section 2.7. This register can only be -/// accessed from privilege level 0. -#[cfg(test)] -fn has_cr4_bit_set() -> bool { - let cr4: u64; - unsafe { - asm!("mov {}, cr4", out(reg) cr4, options(nomem, nostack, preserves_flags)); - } - (cr4 & (1 << 22)) != 0 -} - #[cfg(test)] mod tests { use super::*; + use crate::mpk::enabled::skip_if_mpk_unavailable; #[test] #[ignore = "cannot be run with other tests that munge the PKRU register"] fn check_read() { + skip_if_mpk_unavailable!(); assert_eq!(read(), DISABLE_ACCESS ^ 1); // By default, the Linux kernel only allows a process to access key 0, // the default kernel key. @@ -87,9 +76,10 @@ mod tests { #[test] fn check_roundtrip() { + skip_if_mpk_unavailable!(); let pkru = read(); // Allow access to pages marked with any key. - write(0); + write(ALLOW_ACCESS); assert_eq!(read(), ALLOW_ACCESS); // Restore the original value. write(pkru); diff --git a/crates/runtime/src/mpk/sys.rs b/crates/runtime/src/mpk/sys.rs index 5d357314a7e2..fe68decce8c2 100644 --- a/crates/runtime/src/mpk/sys.rs +++ b/crates/runtime/src/mpk/sys.rs @@ -10,7 +10,8 @@ //! [`pkeys`]: https://man7.org/linux/man-pages/man7/pkeys.7.html use crate::page_size; -use anyhow::{anyhow, Result}; +use anyhow::Result; +use std::io::Error; /// Protection mask allowing reads of pkey-protected memory (see `prot` in /// [`pkey_mprotect`]). @@ -28,13 +29,15 @@ pub const PROT_WRITE: u32 = libc::PROT_WRITE as u32; // == 0b0010; /// Each process has its own separate pkey index; e.g., if process `m` /// allocates key 1, process `n` can as well. pub fn pkey_alloc(flags: u32, access_rights: u32) -> Result { - debug_assert_eq!(flags, 0); // reserved for future use--must be 0. + assert_eq!(flags, 0); // reserved for future use--must be 0. 
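+    // (Per the `pkeys(7)` man page, `access_rights` seeds the new key's PKRU
+    // bits; callers here pass 0 so a fresh key starts fully accessible until
+    // a `ProtectionMask` tightens it.)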
diff --git a/crates/runtime/src/mpk/sys.rs b/crates/runtime/src/mpk/sys.rs
index 5d357314a7e2..fe68decce8c2 100644
--- a/crates/runtime/src/mpk/sys.rs
+++ b/crates/runtime/src/mpk/sys.rs
@@ -10,7 +10,8 @@
 //! [`pkeys`]: https://man7.org/linux/man-pages/man7/pkeys.7.html

 use crate::page_size;
-use anyhow::{anyhow, Result};
+use anyhow::Result;
+use std::io::Error;

 /// Protection mask allowing reads of pkey-protected memory (see `prot` in
 /// [`pkey_mprotect`]).
@@ -28,13 +29,15 @@ pub const PROT_WRITE: u32 = libc::PROT_WRITE as u32; // == 0b0010;
 /// Each process has its own separate pkey index; e.g., if process `m`
 /// allocates key 1, process `n` can as well.
 pub fn pkey_alloc(flags: u32, access_rights: u32) -> Result<u32> {
-    debug_assert_eq!(flags, 0); // reserved for future use--must be 0.
+    assert_eq!(flags, 0); // reserved for future use--must be 0.
     let result = unsafe { libc::syscall(libc::SYS_pkey_alloc, flags, access_rights) };
     if result >= 0 {
-        Ok(result.try_into().expect("TODO"))
+        Ok(result
+            .try_into()
+            .expect("only pkey IDs between 0 and 15 are expected"))
     } else {
         debug_assert_eq!(result, -1); // only this error result is expected.
-        Err(anyhow!(unsafe { errno_as_string() }))
+        Err(Error::last_os_error().into())
     }
 }
@@ -48,7 +51,7 @@ pub fn pkey_free(key: u32) -> Result<()> {
         Ok(())
     } else {
         debug_assert_eq!(result, -1); // only this error result is expected.
-        Err(anyhow!(unsafe { errno_as_string() }))
+        Err(Error::last_os_error().into())
     }
 }
@@ -67,22 +70,10 @@ pub fn pkey_mprotect(addr: usize, len: usize, prot: u32, key: u32) -> Result<()>
         Ok(())
     } else {
         debug_assert_eq!(result, -1); // only this error result is expected.
-        Err(anyhow!(unsafe { errno_as_string() }))
+        Err(Error::last_os_error().into())
     }
 }

-/// Helper function for retrieving the libc error message for the current
-/// error (see GNU libc's ["Checking for Errors"] documentation).
-///
-/// ["Checking for Errors"]: https://www.gnu.org/software/libc/manual/html_node/Checking-for-Errors.html
-unsafe fn errno_as_string() -> String {
-    let errno = *libc::__errno_location();
-    let err_ptr = libc::strerror(errno);
-    std::ffi::CStr::from_ptr(err_ptr)
-        .to_string_lossy()
-        .into_owned()
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -106,13 +97,16 @@
     fn check_invalid_free() {
         let result = pkey_free(42);
         assert!(result.is_err());
-        assert_eq!(result.unwrap_err().to_string(), "Invalid argument");
+        assert_eq!(
+            result.unwrap_err().to_string(),
+            "Invalid argument (os error 22)"
+        );
     }

     #[test]
     #[should_panic]
     fn check_invalid_alloc_flags() {
-        pkey_alloc(42, 0).unwrap();
+        let _ = pkey_alloc(42, 0);
     }

     #[test]
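Taken together, these wrappers follow the allocate/color/free lifecycle from
the `pkeys(7)` man page. A rough standalone illustration of that sequence
(not part of this patch; Linux x86_64 only, using raw `libc` calls and a
hypothetical one-page mapping in place of the wrappers above):

    use std::io::Error;

    fn main() -> Result<(), Error> {
        unsafe {
            // Reserve one page with mmap so the region is page-aligned, as
            // pkey_mprotect requires.
            let len = 4096;
            let addr = libc::mmap(
                std::ptr::null_mut(),
                len,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
                -1,
                0,
            );
            if addr == libc::MAP_FAILED {
                return Err(Error::last_os_error());
            }

            // Allocate a key (both arguments are reserved and must be 0) and
            // color the page with it.
            let key = libc::syscall(libc::SYS_pkey_alloc, 0usize, 0usize);
            if key < 0 {
                return Err(Error::last_os_error());
            }
            let prot = (libc::PROT_READ | libc::PROT_WRITE) as usize;
            if libc::syscall(libc::SYS_pkey_mprotect, addr as usize, len, prot, key) != 0 {
                return Err(Error::last_os_error());
            }

            // Return the key to the kernel and unmap the page when done.
            libc::syscall(libc::SYS_pkey_free, key);
            libc::munmap(addr, len);
        }
        Ok(())
    }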
diff --git a/crates/wasmtime/src/config.rs b/crates/wasmtime/src/config.rs
index 7dc15c020188..1fdec89379a7 100644
--- a/crates/wasmtime/src/config.rs
+++ b/crates/wasmtime/src/config.rs
@@ -14,9 +14,10 @@ use wasmparser::WasmFeatures;
 use wasmtime_cache::CacheConfig;
 use wasmtime_environ::Tunables;
 use wasmtime_jit::profiling::{self, ProfilingAgent};
-use wasmtime_runtime::{InstanceAllocator, OnDemandInstanceAllocator, RuntimeMemoryCreator};
+use wasmtime_runtime::{mpk, InstanceAllocator, OnDemandInstanceAllocator, RuntimeMemoryCreator};

 pub use wasmtime_environ::CacheStore;
+pub use wasmtime_runtime::MpkEnabled;

 /// Represents the module instance allocation strategy to use.
 #[derive(Clone)]
@@ -1186,7 +1187,7 @@ impl Config {
     ///
     /// Note that the pooling allocator can reduce the amount of memory needed
     /// for pooling allocation by using memory protection; see
-    /// [`Config::memory_protection_keys`] for details.
+    /// `PoolingAllocationConfig::memory_protection_keys` for details.
     pub fn static_memory_maximum_size(&mut self, max_size: u64) -> &mut Self {
         let max_pages = max_size / u64::from(wasmtime_environ::WASM_PAGE_SIZE);
         self.tunables.static_memory_bound = max_pages;
@@ -2313,16 +2314,15 @@ impl PoolingAllocationConfig {
     ///
     /// When using the pooling allocator (see [`Config::allocation_strategy`],
     /// [`InstanceAllocationStrategy::Pooling`]), memory protection keys can
-    /// reduce the total amount of allocated memory by eliminating guard regions
-    /// between WebAssembly memories in the pool. It does so by "coloring"
-    /// memory regions with different memory keys and setting which regions are
-    /// accessible each time executions switches from host to guest (or vice
-    /// versa).
+    /// reduce the total amount of allocated virtual memory by eliminating guard
+    /// regions between WebAssembly memories in the pool. It does so by
+    /// "coloring" memory regions with different memory keys and setting which
+    /// regions are accessible each time execution switches from host to guest
+    /// (or vice versa).
     ///
     /// MPK is only available on Linux (called `pku` there) and recent x86
-    /// systems. Checking for support at runtime is possible with
-    /// [`mpk::is_supported`][wasmtime_runtime::mpk::is_supported]. This
-    /// configuration setting can be in three states:
+    /// systems; we check for MPK support at runtime by examining `CPUID`
+    /// feature bits. This configuration setting can be in three states:
     ///
     /// - `auto`: if MPK support is available the guard regions are removed; if
     ///   not, the guard regions remain
@@ -2330,11 +2330,26 @@ impl PoolingAllocationConfig {
     ///   supported
     /// - `disable`: never use MPK
     ///
-    /// By default this value is `auto`.
-    pub fn memory_protection_keys(&mut self, enable: wasmtime_runtime::AutoEnabled) -> &mut Self {
+    /// By default this value is `disable`, but it may become `auto` in future
+    /// releases.
+    ///
+    /// __WARNING__: this configuration option is still experimental--use at
+    /// your own risk! MPK uses kernel and CPU features to protect memory
+    /// regions; you may observe segmentation faults if anything is
+    /// misconfigured.
+    pub fn memory_protection_keys(&mut self, enable: MpkEnabled) -> &mut Self {
         self.config.memory_protection_keys = enable;
         self
     }
+
+    /// Check if memory protection keys (MPK) are available on the current host.
+    ///
+    /// This is a convenience method for determining MPK availability using the
+    /// same check that [`MpkEnabled::Auto`] performs. See
+    /// [`PoolingAllocationConfig::memory_protection_keys`] for more
+    /// information.
+    pub fn are_memory_protection_keys_available(&self) -> bool {
+        mpk::is_supported()
+    }
 }

 pub(crate) fn probestack_supported(arch: Architecture) -> bool {
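For embedders, the end-to-end flow enabled by this patch looks roughly like
the following (a hypothetical sketch, assuming the re-exports shown above and
the existing pooling-allocator API; with `MpkEnabled::Enable`, engine creation
should fail on hosts without MPK support):

    use wasmtime::{
        Config, Engine, InstanceAllocationStrategy, MpkEnabled, PoolingAllocationConfig,
    };

    fn main() -> anyhow::Result<()> {
        let mut pooling = PoolingAllocationConfig::default();
        if pooling.are_memory_protection_keys_available() {
            // `MpkEnabled::Auto` would fall back to guard regions instead;
            // `Enable` turns missing MPK support into an error.
            pooling.memory_protection_keys(MpkEnabled::Enable);
        }

        let mut config = Config::new();
        config.allocation_strategy(InstanceAllocationStrategy::Pooling(pooling));
        let _engine = Engine::new(&config)?;
        Ok(())
    }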