diff --git a/.github/workflows/code_quality-aarch64-darwin.yml b/.github/workflows/code_quality-aarch64-darwin.yml index 63e9e3b0..f3094271 100644 --- a/.github/workflows/code_quality-aarch64-darwin.yml +++ b/.github/workflows/code_quality-aarch64-darwin.yml @@ -30,7 +30,10 @@ jobs: run: cargo fmt -- --check - name: Clippy (default features) - run: cargo clippy -- -D warnings + run: cargo clippy --target aarch64-apple-darwin -- -D warnings - name: Clippy (net feature) - run: cargo clippy --features net -- -D warnings + run: cargo clippy --target aarch64-apple-darwin --features net -- -D warnings + + - name: Clippy (efi feature) + run: cargo clippy --target aarch64-apple-darwin --features efi -- -D warnings diff --git a/Makefile b/Makefile index 3a5e4ae7..ca542b5b 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,7 @@ SNP_INIT_SRC = init/tee/snp_attest.c \ KBS_LD_FLAGS = -lcurl -lidn2 -lssl -lcrypto -lzstd -lz -lbrotlidec-static \ -lbrotlicommon-static +BUILD_INIT = 1 INIT_DEFS = ifeq ($(SEV),1) VARIANT = -sev @@ -24,10 +25,16 @@ ifeq ($(SEV),1) INIT_DEFS += -DSEV=1 INIT_DEFS += $(KBS_LD_FLAGS) INIT_SRC += $(SNP_INIT_SRC) + BUILD_INIT = 0 endif ifeq ($(NET),1) FEATURE_FLAGS += --features net endif +ifeq ($(EFI),1) + VARIANT = -efi + FEATURE_FLAGS := --features efi + BUILD_INIT = 0 +endif ifeq ($(ROSETTA),1) INIT_DEFS += -D__ROSETTA__ @@ -42,9 +49,9 @@ KRUN_BINARY_Linux = libkrun$(VARIANT).so.$(FULL_VERSION) KRUN_SONAME_Linux = libkrun$(VARIANT).so.$(ABI_VERSION) KRUN_BASE_Linux = libkrun$(VARIANT).so -KRUN_BINARY_Darwin = libkrun.$(FULL_VERSION).dylib -KRUN_SONAME_Darwin = libkrun.$(ABI_VERSION).dylib -KRUN_BASE_Darwin = libkrun.dylib +KRUN_BINARY_Darwin = libkrun$(VARIANT).$(FULL_VERSION).dylib +KRUN_SONAME_Darwin = libkrun$(VARIANT).$(ABI_VERSION).dylib +KRUN_BASE_Darwin = libkrun$(VARIANT).dylib LIBRARY_RELEASE_Linux = target/release/$(KRUN_BINARY_Linux) LIBRARY_DEBUG_Linux = target/debug/$(KRUN_BINARY_Linux) @@ -64,7 +71,7 @@ all: $(LIBRARY_RELEASE_$(OS)) 
libkrun.pc debug: $(LIBRARY_DEBUG_$(OS)) libkrun.pc -ifneq ($(SEV),1) +ifeq ($(BUILD_INIT),1) INIT_BINARY = init/init $(INIT_BINARY): $(INIT_SRC) gcc -O2 -static -Wall $(INIT_DEFS) -o $@ $(INIT_SRC) $(INIT_DEFS) diff --git a/edk2/KRUN_EFI.silent.fd b/edk2/KRUN_EFI.silent.fd new file mode 100644 index 00000000..4a5ba251 Binary files /dev/null and b/edk2/KRUN_EFI.silent.fd differ diff --git a/edk2/License.txt b/edk2/License.txt new file mode 100644 index 00000000..ee840505 --- /dev/null +++ b/edk2/License.txt @@ -0,0 +1,51 @@ +Copyright (c) 2019, TianoCore and contributors. All rights reserved. + +SPDX-License-Identifier: BSD-2-Clause-Patent + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +Subject to the terms and conditions of this license, each copyright holder +and contributor hereby grants to those receiving rights under this license +a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except for failure to satisfy the conditions of this license) patent +license to make, have made, use, offer to sell, sell, import, and otherwise +transfer this software, where such license applies only to those patent +claims, already acquired or hereafter acquired, licensable by such copyright +holder or contributor that are necessarily infringed by: + +(a) their Contribution(s) (the licensed copyrights of copyright holders and + non-copyrightable additions of contributors, in source or binary form) + alone; or + +(b) combination of their Contribution(s) with the work of authorship to + which such Contribution(s) was added by such copyright holder or + contributor, if, at the time the Contribution is added, such addition + causes such combination to be necessarily infringed. The patent license + shall not apply to any other combinations which include the + Contribution. + +Except as expressly stated above, no rights or licenses from any copyright +holder or contributor is granted under this license, whether expressly, by +implication, estoppel or otherwise. + +DISCLAIMER + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/edk2/Sources.txt b/edk2/Sources.txt new file mode 100644 index 00000000..2c155536 --- /dev/null +++ b/edk2/Sources.txt @@ -0,0 +1 @@ +KRUN_EFI.silent.fd was built from commit 82563b1d62ba029bd7bbfff73d49cce21af302c0 of the https://github.com/slp/edk2 repository. diff --git a/examples/Makefile b/examples/Makefile index 39015530..8c163059 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -4,29 +4,37 @@ LDFLAGS_x86_64_Linux = -lkrun LDFLAGS_aarch64_Linux = -lkrun LDFLAGS_arm64_Darwin = -L/opt/homebrew/lib -lkrun LDFLAGS_sev = -lkrun-sev -CFLAGS_Linux = -O2 -g -CFLAGS_Darwin = -O2 -g -I/opt/homebrew/include +LDFLAGS_efi = -L/opt/homebrew/lib -lkrun-efi +CFLAGS = -O2 -g -I../include ROOTFS_DISTRO := fedora ROOTFS_DIR = rootfs_$(ROOTFS_DISTRO) .PHONY: clean rootfs +EXAMPLES := chroot_vm ifeq ($(SEV),1) EXAMPLES := launch-tee -else - EXAMPLES := chroot_vm +endif +ifeq ($(EFI),1) + EXAMPLES := boot_efi endif all: $(EXAMPLES) chroot_vm: chroot_vm.c - gcc -o $@ $< $(CFLAGS_$(OS)) $(LDFLAGS_$(ARCH)_$(OS)) + gcc -o $@ $< $(CFLAGS) $(LDFLAGS_$(ARCH)_$(OS)) ifeq ($(OS),Darwin) codesign --entitlements chroot_vm.entitlements --force -s - $@ endif launch-tee: launch-tee.c - gcc -o $@ $< $(CFLAGS_$(OS)) $(LDFLAGS_sev) + gcc -o $@ $< $(CFLAGS) $(LDFLAGS_sev) + +boot_efi: boot_efi.c + gcc -o $@ $< $(CFLAGS) $(LDFLAGS_efi) +ifeq ($(OS),Darwin) + codesign --entitlements chroot_vm.entitlements --force -s - $@ +endif # Build the rootfs to be used with 
chroot_vm. rootfs: @@ -36,4 +44,4 @@ rootfs: podman rm libkrun_chroot_vm clean: - rm -rf chroot_vm $(ROOTFS_DIR) launch-tee + rm -rf chroot_vm $(ROOTFS_DIR) launch-tee boot_efi diff --git a/examples/boot_efi.c b/examples/boot_efi.c new file mode 100644 index 00000000..4458b8c4 --- /dev/null +++ b/examples/boot_efi.c @@ -0,0 +1,181 @@ +/* + * This is an example of booting a raw disk image with libkrun's EFI variant. + * + * It boots the image passed as DISK inside a fresh Virtual Machine created + * and managed by libkrun, with networking provided through a passt socket. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_ARGS_LEN 4096 +#ifndef MAX_PATH +#define MAX_PATH 4096 +#endif + +static void print_help(char *const name) +{ + fprintf(stderr, + "Usage: %s [OPTIONS] DISK\n" + "OPTIONS: \n" + " -h --help Show help\n" + " --passt-socket=PATH Connect to passt socket at PATH" + "\n" + "DISK: path to the vm's disk image in raw format\n", + name + ); +} + +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "passt-socket", required_argument, NULL, 'P' }, + { NULL, 0, NULL, 0 } +}; + +struct cmdline { + bool show_help; + char const *passt_socket_path; + char const *disk_image; +}; + +bool parse_cmdline(int argc, char *const argv[], struct cmdline *cmdline) +{ + assert(cmdline != NULL); + + // set the defaults + *cmdline = (struct cmdline){ + .show_help = false, + .passt_socket_path = "/tmp/network.sock", + .disk_image = NULL, + }; + + int option_index = 0; + int c; + // the '+' in optstring is a GNU extension that disables permuting argv + while ((c = getopt_long(argc, argv, "+h", long_options, &option_index)) != -1) { + switch (c) { + case 'h': + cmdline->show_help = true; + return true; + case 'P': + cmdline->passt_socket_path = optarg; + break; + case '?': + return false; + default: + fprintf(stderr, "internal argument parsing error (returned character code 0x%x)\n", c); + return
false; + } + } + + if (optind <= argc - 1) { + cmdline->disk_image = argv[optind]; + return true; + } + + if (optind == argc) { + fprintf(stderr, "Missing DISK argument\n"); + } + + return false; +} + +int connect_to_passt(char *socket_path) +{ + struct sockaddr_un addr; + int socket_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (socket_fd < 0) { + perror("Failed to create passt socket fd"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, socket_path, sizeof(addr.sun_path) - 1); + + if (connect(socket_fd, (const struct sockaddr *) &addr, sizeof(addr)) < 0) { + perror("Failed to bind passt socket"); + return -1; + } + + return socket_fd; +} + +int main(int argc, char *const argv[]) +{ + int ctx_id; + int err; + struct cmdline cmdline; + + if (!parse_cmdline(argc, argv, &cmdline)) { + putchar('\n'); + print_help(argv[0]); + return -1; + } + + if (cmdline.show_help){ + print_help(argv[0]); + return 0; + } + + // Set the log level to "off". + err = krun_set_log_level(0); + if (err) { + errno = -err; + perror("Error configuring log level"); + return -1; + } + + // Create the configuration context. + ctx_id = krun_create_ctx(); + if (ctx_id < 0) { + errno = -ctx_id; + perror("Error creating configuration context"); + return -1; + } + + // Configure the number of vCPUs (2) and the amount of RAM (1024 MiB). + if (err = krun_set_vm_config(ctx_id, 2, 1024)) { + errno = -err; + perror("Error configuring the number of vCPUs and/or the amount of RAM"); + return -1; + } + + if (err = krun_set_root_disk(ctx_id, cmdline.disk_image)) { + errno = -err; + perror("Error configuring disk image"); + return -1; + } + + int passt_fd = connect_to_passt(cmdline.passt_socket_path); + + if (passt_fd < 0) { + return -1; + } + + if (err = krun_set_passt_fd(ctx_id, passt_fd)) { + errno = -err; + perror("Error configuring net mode"); + return -1; + } + + // Start and enter the microVM. 
Unless there is some error while creating the microVM + // this function never returns. + if (err = krun_start_enter(ctx_id)) { + errno = -err; + perror("Error creating the microVM"); + return -1; + } + + // Not reached. + return 0; +} diff --git a/src/arch/Cargo.toml b/src/arch/Cargo.toml index d296e286..6de790f2 100644 --- a/src/arch/Cargo.toml +++ b/src/arch/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [features] tee = [] amd-sev = [ "tee" ] +efi = [] [dependencies] libc = ">=0.2.39" diff --git a/src/arch/src/aarch64/fdt.rs b/src/arch/src/aarch64/fdt.rs index 84e45362..bb2e717c 100644 --- a/src/arch/src/aarch64/fdt.rs +++ b/src/arch/src/aarch64/fdt.rs @@ -172,7 +172,7 @@ fn create_memory_node( _guest_mem: &GuestMemoryMmap, arch_memory_info: &ArchMemoryInfo, ) -> Result<()> { - let mem_size = arch_memory_info.ram_last_addr - super::layout::DRAM_MEM_START + 1; + let mem_size = arch_memory_info.ram_last_addr - super::layout::DRAM_MEM_START; // See https://github.com/torvalds/linux/blob/master/Documentation/devicetree/booting-without-of.txt#L960 // for an explanation of this. let mem_reg_prop = generate_prop64(&[super::layout::DRAM_MEM_START, mem_size]); diff --git a/src/arch/src/aarch64/layout.rs b/src/arch/src/aarch64/layout.rs index ca61c790..f8b8d3aa 100644 --- a/src/arch/src/aarch64/layout.rs +++ b/src/arch/src/aarch64/layout.rs @@ -50,7 +50,10 @@ // Taken from (http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf). /// Start of RAM on 64 bit ARM. +#[cfg(not(feature = "efi"))] pub const DRAM_MEM_START: u64 = 0x8000_0000; // 2 GB. +#[cfg(feature = "efi")] +pub const DRAM_MEM_START: u64 = 0x4000_0000; // 1 GB. /// The maximum addressable RAM address. pub const DRAM_MEM_END: u64 = 0x00FF_8000_0000; // 1024 - 2 = 1022 GB. /// The maximum RAM size. 
@@ -82,4 +85,7 @@ pub const GTIMER_VIRT: u32 = 11; pub const GTIMER_PHYS: u32 = 12; /// Below this address will reside the GIC, above this address will reside the MMIO devices. +#[cfg(not(feature = "efi"))] pub const MAPPED_IO_START: u64 = 1 << 30; // 1 GB +#[cfg(feature = "efi")] +pub const MAPPED_IO_START: u64 = 0x0a00_0000; diff --git a/src/arch/src/aarch64/macos/gicv2.rs b/src/arch/src/aarch64/macos/gicv2.rs index 07806740..21dd161a 100644 --- a/src/arch/src/aarch64/macos/gicv2.rs +++ b/src/arch/src/aarch64/macos/gicv2.rs @@ -61,7 +61,7 @@ impl GICDevice for GICv2 { } fn fdt_compatibility(&self) -> &str { - "arm,gic-400" + "arm,cortex-a15-gic" } fn fdt_maint_irq(&self) -> u32 { diff --git a/src/arch/src/aarch64/mod.rs b/src/arch/src/aarch64/mod.rs index 385b1a99..79845218 100644 --- a/src/arch/src/aarch64/mod.rs +++ b/src/arch/src/aarch64/mod.rs @@ -69,10 +69,17 @@ pub fn arch_memory_regions(size: usize) -> (ArchMemoryInfo, Vec<(GuestAddress, u shm_start_addr: 0, shm_size: 0, }; - ( - info, - vec![(GuestAddress(layout::DRAM_MEM_START), dram_size)], - ) + let regions = if cfg!(feature = "efi") { + vec![ + // Space for loading EDK2 and its variables + (GuestAddress(0u64), 0x800_0000), + (GuestAddress(layout::DRAM_MEM_START), dram_size), + ] + } else { + vec![(GuestAddress(layout::DRAM_MEM_START), dram_size)] + }; + + (info, regions) } /// Configures the system and should be called once per vm before starting vcpu threads. @@ -129,13 +136,17 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> supe } // Auxiliary function to get the address where the device tree blob is loaded. -pub fn get_fdt_addr(mem: &GuestMemoryMmap) -> u64 { +pub fn get_fdt_addr(_mem: &GuestMemoryMmap) -> u64 { // If the memory allocated is smaller than the size allocated for the FDT, // we return the start of the DRAM so that // we allow the code to try and load the FDT. 
- if let Some(addr) = mem.last_addr().checked_sub(layout::FDT_MAX_SIZE as u64 - 1) { - if mem.address_in_range(addr) { + #[cfg(not(feature = "efi"))] + if let Some(addr) = _mem + .last_addr() + .checked_sub(layout::FDT_MAX_SIZE as u64 - 1) + { + if _mem.address_in_range(addr) { return addr.raw_value(); } } diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index 267a4f1b..aa16759a 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -6,8 +6,10 @@ edition = "2021" [features] tee = [] -amd-sev = ["tee"] +amd-sev = ["blk", "tee"] net = [] +blk = [] +efi = ["blk", "net"] [dependencies] bitflags = "1.2.0" diff --git a/src/devices/src/legacy/x86_64/serial.rs b/src/devices/src/legacy/x86_64/serial.rs index f98ad32c..7cb3b607 100644 --- a/src/devices/src/legacy/x86_64/serial.rs +++ b/src/devices/src/legacy/x86_64/serial.rs @@ -73,7 +73,7 @@ pub struct Serial { } impl Serial { - fn new( + pub fn new( interrupt_evt: EventFd, out: Option>, input: Option>, diff --git a/src/devices/src/virtio/block/device.rs b/src/devices/src/virtio/block/device.rs index fa06fdf9..779d5752 100644 --- a/src/devices/src/virtio/block/device.rs +++ b/src/devices/src/virtio/block/device.rs @@ -9,14 +9,17 @@ use std::cmp; use std::convert::From; use std::fs::{File, OpenOptions}; use std::io::{self, Seek, SeekFrom, Write}; +#[cfg(target_os = "linux")] use std::os::linux::fs::MetadataExt; +#[cfg(target_os = "macos")] +use std::os::macos::fs::MetadataExt; use std::path::PathBuf; use std::result; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex}; use log::{error, warn}; -use utils::eventfd::EventFd; +use utils::eventfd::{EventFd, EFD_NONBLOCK}; use virtio_bindings::{virtio_blk::*, virtio_config::VIRTIO_F_VERSION_1}; use vm_memory::{Bytes, GuestMemoryError, GuestMemoryMmap}; @@ -207,7 +210,7 @@ impl Block { avail_features |= 1u64 << VIRTIO_BLK_F_RO; }; - let queue_evts = [EventFd::new(libc::EFD_NONBLOCK)?]; + let queue_evts = 
[EventFd::new(EFD_NONBLOCK)?]; let queues = QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(); @@ -220,11 +223,11 @@ impl Block { avail_features, acked_features: 0u64, interrupt_status: Arc::new(AtomicUsize::new(0)), - interrupt_evt: EventFd::new(libc::EFD_NONBLOCK)?, + interrupt_evt: EventFd::new(EFD_NONBLOCK)?, queue_evts, queues, device_state: DeviceState::Inactive, - activate_evt: EventFd::new(libc::EFD_NONBLOCK)?, + activate_evt: EventFd::new(EFD_NONBLOCK)?, intc: None, irq_line: None, }) @@ -458,4 +461,12 @@ impl VirtioDevice for Block { self.device_state = DeviceState::Activated(mem); Ok(()) } + + fn reset(&mut self) -> bool { + // Strictly speaking, we should unsubscribe the queue events, resubscribe + // the activate eventfd and deactivate the device, but we don't support + // any scenario in which neither GuestMemory nor the queue events would + // change, so let's avoid doing any unnecessary work. + true + } } diff --git a/src/devices/src/virtio/console/device.rs b/src/devices/src/virtio/console/device.rs index e8cf78a4..6c68df85 100644 --- a/src/devices/src/virtio/console/device.rs +++ b/src/devices/src/virtio/console/device.rs @@ -227,6 +227,7 @@ impl Console { if self.ports[cmd.id as usize].is_console() { self.control.mark_console_port(mem, cmd.id); + self.control.port_open(cmd.id, true); } else { // We start with all ports open, this makes sense for now, // because underlying file descriptors STDIN, STDOUT, STDERR are always open too @@ -359,14 +360,23 @@ impl VirtioDevice for Console { DeviceState::Activated(_) => true, } } -} -impl VmmExitObserver for Console { - fn on_vmm_exit(&mut self) { + fn reset(&mut self) -> bool { + // Strictly speaking, we should also unsubscribe the queue + // events, resubscribe the activate eventfd and deactivate + // the device, but we don't support any scenario in which + // neither GuestMemory nor the queue events would change, + // so let's avoid doing any unnecessary work.
for port in &mut self.ports { - port.flush(); + port.shutdown(); } + true + } +} +impl VmmExitObserver for Console { + fn on_vmm_exit(&mut self) { + self.reset(); log::trace!("Console on_vmm_exit finished"); } } diff --git a/src/devices/src/virtio/console/port.rs b/src/devices/src/virtio/console/port.rs index 09a25ba5..8a5ca304 100644 --- a/src/devices/src/virtio/console/port.rs +++ b/src/devices/src/virtio/console/port.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::thread::JoinHandle; use std::{mem, thread}; @@ -29,11 +29,9 @@ pub enum PortDescription { } enum PortState { - Inactive { - input: Option>, - output: Option>, - }, + Inactive, Active { + stopfd: utils::eventfd::EventFd, stop: Arc, rx_thread: Option>, tx_thread: Option>, @@ -46,6 +44,8 @@ pub(crate) struct Port { name: Cow<'static, str>, represents_console: bool, state: PortState, + input: Option>>>, + output: Option>>>, } impl Port { @@ -55,25 +55,25 @@ impl Port { port_id, name: "".into(), represents_console: true, - state: PortState::Inactive { input, output }, + state: PortState::Inactive, + input: Some(Arc::new(Mutex::new(input.unwrap()))), + output: Some(Arc::new(Mutex::new(output.unwrap()))), }, PortDescription::InputPipe { name, input } => Self { port_id, name, represents_console: false, - state: PortState::Inactive { - input: Some(input), - output: None, - }, + state: PortState::Inactive, + input: Some(Arc::new(Mutex::new(input))), + output: None, }, PortDescription::OutputPipe { name, output } => Self { port_id, name, represents_console: false, - state: PortState::Inactive { - input: None, - output: Some(output), - }, + state: PortState::Inactive, + input: None, + output: Some(Arc::new(Mutex::new(output))), }, } } @@ -114,38 +114,56 @@ impl Port { irq_signaler: IRQSignaler, control: Arc, ) { - let (input, output) = if let PortState::Inactive { input, output } = &mut self.state { - 
(mem::take(input), mem::take(output)) - } else { - // The threads are already started - return; + if let PortState::Active { .. } = &mut self.state { + self.shutdown(); }; + let input = self.input.as_ref().cloned(); + let output = self.output.as_ref().cloned(); + + let stopfd = utils::eventfd::EventFd::new(utils::eventfd::EFD_NONBLOCK) + .expect("Failed to create EventFd for interrupt_evt"); + let stop = Arc::new(AtomicBool::new(false)); + let rx_thread = input.map(|input| { let mem = mem.clone(); let irq_signaler = irq_signaler.clone(); let port_id = self.port_id; - thread::spawn(move || process_rx(mem, rx_queue, irq_signaler, input, control, port_id)) + let stopfd = stopfd.try_clone().unwrap(); + let stop = stop.clone(); + thread::spawn(move || { + process_rx( + mem, + rx_queue, + irq_signaler, + input, + control, + port_id, + stopfd, + stop, + ) + }) }); - let stop = Arc::new(AtomicBool::new(false)); let tx_thread = output.map(|output| { let stop = stop.clone(); thread::spawn(move || process_tx(mem, tx_queue, irq_signaler, output, stop)) }); self.state = PortState::Active { + stopfd, stop, rx_thread, tx_thread, } } - pub fn flush(&mut self) { + pub fn shutdown(&mut self) { if let PortState::Active { + stopfd, stop, tx_thread, - rx_thread: _, + rx_thread, } = &mut self.state { stop.store(true, Ordering::Release); @@ -158,6 +176,16 @@ impl Port { ) } } + stopfd.write(1).unwrap(); + if let Some(rx_thread) = mem::take(rx_thread) { + rx_thread.thread().unpark(); + if let Err(e) = rx_thread.join() { + log::error!( + "Failed to flush tx for port {port_id}, thread panicked: {e:?}", + port_id = self.port_id + ) + } + } }; } } diff --git a/src/devices/src/virtio/console/port_io.rs b/src/devices/src/virtio/console/port_io.rs index cd0cb4fd..30a01aba 100644 --- a/src/devices/src/virtio/console/port_io.rs +++ b/src/devices/src/virtio/console/port_io.rs @@ -14,7 +14,7 @@ use vm_memory::{VolatileMemoryError, VolatileSlice, WriteVolatile}; pub trait PortInput { fn 
read_volatile(&mut self, buf: &mut VolatileSlice) -> Result; - fn wait_until_readable(&self); + fn wait_until_readable(&self, stopfd: Option<&EventFd>); } pub trait PortOutput { @@ -85,8 +85,12 @@ impl PortInput for PortInputFd { } } - fn wait_until_readable(&self) { - let mut poll_fds = [PollFd::new(self.as_raw_fd(), PollFlags::POLLIN)]; + fn wait_until_readable(&self, stopfd: Option<&EventFd>) { + let mut poll_fds = Vec::new(); + poll_fds.push(PollFd::new(self.as_raw_fd(), PollFlags::POLLIN)); + if let Some(stopfd) = stopfd { + poll_fds.push(PollFd::new(stopfd.as_raw_fd(), PollFlags::POLLIN)); + } poll(&mut poll_fds, -1).expect("Failed to poll"); } } @@ -213,7 +217,7 @@ impl PortInput for PortInputSigInt { Ok(1) } - fn wait_until_readable(&self) { + fn wait_until_readable(&self, _stopfd: Option<&EventFd>) { let mut poll_fds = [PollFd::new(self.sigint_evt.as_raw_fd(), PollFlags::POLLIN)]; poll(&mut poll_fds, -1).expect("Failed to poll"); } diff --git a/src/devices/src/virtio/console/process_rx.rs b/src/devices/src/virtio/console/process_rx.rs index 06560e77..a0fb2549 100644 --- a/src/devices/src/virtio/console/process_rx.rs +++ b/src/devices/src/virtio/console/process_rx.rs @@ -1,4 +1,5 @@ -use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; use std::{io, thread}; use vm_memory::{GuestMemory, GuestMemoryError, GuestMemoryMmap, GuestMemoryRegion}; @@ -8,17 +9,21 @@ use crate::virtio::console::irq_signaler::IRQSignaler; use crate::virtio::console::port_io::PortInput; use crate::virtio::{DescriptorChain, Queue}; +#[allow(clippy::too_many_arguments)] pub(crate) fn process_rx( mem: GuestMemoryMmap, mut queue: Queue, irq: IRQSignaler, - mut input: Box, + input: Arc>>, control: Arc, port_id: u32, + stopfd: utils::eventfd::EventFd, + stop: Arc, ) { let mem = &mem; let mut eof = false; + let mut input = input.lock().unwrap(); loop { let head = pop_head_blocking(&mut queue, mem, &irq); @@ -52,7 +57,11 @@ pub(crate) fn 
process_rx( } else if bytes_read == 0 { queue.undo_pop(); irq.signal_used_queue("rx WouldBlock"); - input.wait_until_readable(); + input.wait_until_readable(Some(&stopfd)); + } + + if stop.load(Ordering::Acquire) { + return; } } } diff --git a/src/devices/src/virtio/console/process_tx.rs b/src/devices/src/virtio/console/process_tx.rs index ad89b514..0be302f8 100644 --- a/src/devices/src/virtio/console/process_tx.rs +++ b/src/devices/src/virtio/console/process_tx.rs @@ -1,5 +1,5 @@ use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::{io, thread}; use vm_memory::{GuestMemory, GuestMemoryError, GuestMemoryMmap, GuestMemoryRegion}; @@ -12,7 +12,7 @@ pub(crate) fn process_tx( mem: GuestMemoryMmap, mut queue: Queue, irq: IRQSignaler, - mut output: Box, + output: Arc>>, stop: Arc, ) { loop { @@ -25,7 +25,7 @@ pub(crate) fn process_tx( for desc in head.into_iter().readable() { let desc_len = desc.len as usize; - match write_desc_to_output(desc, output.as_mut(), &irq) { + match write_desc_to_output(desc, output.lock().unwrap().as_mut(), &irq) { Ok(0) => { break; } diff --git a/src/devices/src/virtio/device.rs b/src/devices/src/virtio/device.rs index 73e4ed9a..70b2ce26 100644 --- a/src/devices/src/virtio/device.rs +++ b/src/devices/src/virtio/device.rs @@ -115,8 +115,8 @@ pub trait VirtioDevice: AsAny + Send { /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. 
- fn reset(&mut self) -> Option<(EventFd, Vec)> { - None + fn reset(&mut self) -> bool { + false } /// Get base and size of the SHM region diff --git a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index d6d573e2..f2f8084d 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -6,7 +6,9 @@ use std::collections::btree_map; use std::collections::{BTreeMap, HashMap}; use std::ffi::{CStr, CString}; use std::fs::File; -use std::mem::{self, MaybeUninit}; +#[cfg(not(feature = "efi"))] +use std::mem; +use std::mem::MaybeUninit; use std::num::NonZeroUsize; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::path::PathBuf; @@ -36,6 +38,7 @@ const XATTR_KEY: &[u8] = b"user.containers.override_stat\0"; const UID_MAX: u32 = u32::MAX - 1; +#[cfg(not(feature = "efi"))] static INIT_BINARY: &[u8] = include_bytes!("../../../../../../init/init"); type Inode = u64; @@ -1141,9 +1144,10 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("lookup: {:?}", name); - let init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; + let _init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - if self.init_inode != 0 && name == init_name { + #[cfg(not(feature = "efi"))] + if self.init_inode != 0 && name == _init_name { let mut st: bindings::stat64 = unsafe { mem::zeroed() }; st.st_size = INIT_BINARY.len() as i64; st.st_ino = self.init_inode; @@ -1159,6 +1163,8 @@ impl FileSystem for PassthroughFs { } else { self.do_lookup(parent, name) } + #[cfg(feature = "efi")] + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1410,6 +1416,7 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {:?}", inode); + #[cfg(not(feature = "efi"))] if inode == self.init_inode { return w.write(&INIT_BINARY[offset as 
usize..(offset + (size as u64)) as usize]); } diff --git a/src/devices/src/virtio/mmio.rs b/src/devices/src/virtio/mmio.rs index d2a1cfb3..37c475e1 100644 --- a/src/devices/src/virtio/mmio.rs +++ b/src/devices/src/virtio/mmio.rs @@ -94,13 +94,6 @@ impl MmioTransport { self.device_status & (set | clr) == set } - fn are_queues_valid(&self) -> bool { - self.locked_device() - .queues() - .iter() - .all(|q| q.is_valid(&self.mem)) - } - fn with_queue(&self, d: U, f: F) -> U where F: FnOnce(&Queue) -> U, @@ -129,10 +122,7 @@ impl MmioTransport { } fn update_queue_field(&mut self, f: F) { - if self.check_device_status( - device_status::FEATURES_OK, - device_status::DRIVER_OK | device_status::FAILED, - ) { + if self.check_device_status(device_status::FEATURES_OK, device_status::FAILED) { self.with_queue_mut(f); } else { warn!( @@ -184,7 +174,7 @@ impl MmioTransport { DRIVER_OK if self.device_status == (ACKNOWLEDGE | DRIVER | FEATURES_OK) => { self.device_status = status; let device_activated = self.locked_device().is_activated(); - if !device_activated && self.are_queues_valid() { + if !device_activated { self.locked_device() .activate(self.mem.clone()) .expect("Failed to activate device"); @@ -195,16 +185,8 @@ impl MmioTransport { self.device_status |= FAILED; } _ if status == 0 => { - if self.locked_device().is_activated() { - let mut device_status = self.device_status; - let reset_result = self.locked_device().reset(); - match reset_result { - Some((_interrupt_evt, mut _queue_evts)) => {} - None => { - device_status |= FAILED; - } - } - self.device_status = device_status; + if self.locked_device().is_activated() && !self.locked_device().reset() { + self.device_status |= FAILED; } // If the backend device driver doesn't support reset, @@ -478,9 +460,7 @@ pub(crate) mod tests { #[test] fn test_new() { let m = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x1000)]).unwrap(); - let mut dummy = DummyDevice::new(); - // Validate reset is no-op. 
- assert!(dummy.reset().is_none()); + let dummy = DummyDevice::new(); let mut d = MmioTransport::new(m, Arc::new(Mutex::new(dummy))); // We just make sure here that the implementation of a mmio device behaves as we expect, @@ -488,8 +468,6 @@ pub(crate) mod tests { assert_eq!(d.locked_device().queue_events().len(), 2); - assert!(!d.are_queues_valid()); - d.queue_select = 0; assert_eq!(d.with_queue(0, Queue::get_max_size), 16); assert!(d.with_queue_mut(|q| q.size = 16)); @@ -503,8 +481,6 @@ pub(crate) mod tests { d.queue_select = 2; assert_eq!(d.with_queue(0, Queue::get_max_size), 0); assert!(!d.with_queue_mut(|q| q.size = 16)); - - assert!(!d.are_queues_valid()); } #[test] @@ -748,7 +724,6 @@ pub(crate) mod tests { let m = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x1000)]).unwrap(); let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new()))); - assert!(!d.are_queues_valid()); assert!(!d.locked_device().is_activated()); assert_eq!(d.device_status, device_status::INIT); @@ -787,7 +762,6 @@ pub(crate) mod tests { write_le_u32(&mut buf[..], 1); d.write(0, 0x44, &buf[..]); } - assert!(d.are_queues_valid()); assert!(!d.locked_device().is_activated()); // Device should be ready for activation now. @@ -812,14 +786,6 @@ pub(crate) mod tests { | device_status::DRIVER_OK ); assert!(d.locked_device().is_activated()); - - // A write which changes the size of a queue after activation; currently only triggers - // a warning path and have no effect on queue state. - write_le_u32(&mut buf[..], 0); - d.queue_select = 0; - d.write(0, 0x44, &buf[..]); - d.read(0, 0x44, &mut buf[..]); - assert_eq!(read_le_u32(&buf[..]), 1); } fn activate_device(d: &mut MmioTransport) { @@ -840,7 +806,6 @@ pub(crate) mod tests { write_le_u32(&mut buf[..], 1); d.write(0, 0x44, &buf[..]); } - assert!(d.are_queues_valid()); assert!(!d.locked_device().is_activated()); // Device should be ready for activation now. 
@@ -867,7 +832,6 @@ pub(crate) mod tests { let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new()))); let mut buf = [0; 4]; - assert!(!d.are_queues_valid()); assert!(!d.locked_device().is_activated()); assert_eq!(d.device_status, 0); activate_device(&mut d); diff --git a/src/devices/src/virtio/mod.rs b/src/devices/src/virtio/mod.rs index d5963424..f8bdaa4f 100644 --- a/src/devices/src/virtio/mod.rs +++ b/src/devices/src/virtio/mod.rs @@ -12,7 +12,7 @@ use std::io::Error as IOError; #[cfg(not(feature = "tee"))] pub mod balloon; -#[cfg(feature = "tee")] +#[cfg(feature = "blk")] pub mod block; pub mod console; pub mod device; @@ -30,8 +30,8 @@ pub mod vsock; #[cfg(not(feature = "tee"))] pub use self::balloon::*; -#[cfg(feature = "tee")] -pub use self::block::*; +#[cfg(feature = "blk")] +pub use self::block::{Block, CacheType}; pub use self::console::*; pub use self::device::*; #[cfg(not(feature = "tee"))] diff --git a/src/devices/src/virtio/net/device.rs b/src/devices/src/virtio/net/device.rs index ac797a70..7dc7bf80 100644 --- a/src/devices/src/virtio/net/device.rs +++ b/src/devices/src/virtio/net/device.rs @@ -139,6 +139,10 @@ impl Net { &self.id } + pub fn set_intc(&mut self, intc: Arc>) { + self.intc = Some(intc); + } + pub(crate) fn process_rx_queue_event(&mut self) { if let Err(e) = self.queue_evts[RX_INDEX].read() { log::error!("Failed to get rx event from queue: {:?}", e); diff --git a/src/devices/src/virtio/queue.rs b/src/devices/src/virtio/queue.rs index 939354f9..87ccd28c 100644 --- a/src/devices/src/virtio/queue.rs +++ b/src/devices/src/virtio/queue.rs @@ -308,7 +308,7 @@ impl Queue { /// Pop the first available descriptor chain from the avail ring. 
pub fn pop<'b>(&mut self, mem: &'b GuestMemoryMmap) -> Option> { - if self.len(mem) == 0 { + if self.len(mem) == 0 || self.actual_size() == 0 { return None; } diff --git a/src/devices/src/virtio/rng/device.rs b/src/devices/src/virtio/rng/device.rs index 171fb6d2..4b6f4da7 100644 --- a/src/devices/src/virtio/rng/device.rs +++ b/src/devices/src/virtio/rng/device.rs @@ -201,4 +201,12 @@ impl VirtioDevice for Rng { DeviceState::Activated(_) => true, } } + + fn reset(&mut self) -> bool { + // Strictly speaking, we should unsubscribe the queue events, resubscribe + // the activate eventfd and deactivate the device, but we don't support + // any scenario in which either GuestMemory or the queue events would + // change, so let's avoid doing any unnecessary work. + true + } } diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index aca84eb6..fd89c60f 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -7,8 +7,10 @@ build = "build.rs" [features] tee = [] -amd-sev = [ "tee" ] +amd-sev = [ "blk", "tee" ] net = [] +blk = [] +efi = [ "blk", "net" ] [dependencies] env_logger = "0.9.0" diff --git a/src/libkrun/build.rs b/src/libkrun/build.rs index ce74b29b..a3ccc228 100644 --- a/src/libkrun/build.rs +++ b/src/libkrun/build.rs @@ -3,7 +3,7 @@ fn main() { println!("cargo:rustc-link-lib=framework=Hypervisor"); #[cfg(target_os = "macos")] println!("cargo:rustc-link-search=/opt/homebrew/lib"); - #[cfg(not(feature = "tee"))] + #[cfg(all(not(feature = "tee"), not(feature = "efi")))] println!("cargo:rustc-link-lib=krunfw"); #[cfg(feature = "tee")] println!("cargo:rustc-link-lib=krunfw-sev"); diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 6fe0e9e1..480feca5 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -18,18 +18,21 @@ use std::slice; use std::sync::atomic::{AtomicI32, Ordering}; use std::sync::Mutex; -#[cfg(feature = "tee")] +#[cfg(feature = "blk")] use devices::virtio::CacheType; use env_logger::Env; -use
libc::{c_char, c_int, size_t}; +#[cfg(not(feature = "efi"))] +use libc::size_t; +use libc::{c_char, c_int}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; use vmm::resources::VmResources; -#[cfg(feature = "tee")] +#[cfg(feature = "blk")] use vmm::vmm_config::block::BlockDeviceConfig; use vmm::vmm_config::boot_source::{BootSourceConfig, DEFAULT_KERNEL_CMDLINE}; #[cfg(not(feature = "tee"))] use vmm::vmm_config::fs::FsDeviceConfig; +#[cfg(not(feature = "efi"))] use vmm::vmm_config::kernel_bundle::KernelBundle; #[cfg(feature = "tee")] use vmm::vmm_config::kernel_bundle::{InitrdBundle, QbootBundle}; @@ -39,6 +42,7 @@ use vmm::vmm_config::net::NetworkInterfaceConfig; use vmm::vmm_config::vsock::VsockDeviceConfig; // Minimum krunfw version we require. +#[cfg(not(feature = "efi"))] const KRUNFW_MIN_VERSION: u32 = 4; // Value returned on success. We use libc's errors otherwise. const KRUN_SUCCESS: i32 = 0; @@ -82,9 +86,9 @@ struct ContextConfig { net_cfg: NetworkConfig, #[cfg(not(feature = "tee"))] fs_cfg: Option, - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] root_block_cfg: Option, - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] data_block_cfg: Option, #[cfg(feature = "tee")] tee_config_file: Option, @@ -156,22 +160,22 @@ impl ContextConfig { self.fs_cfg.clone() } - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] fn set_root_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { self.root_block_cfg = Some(block_cfg); } - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] fn get_root_block_cfg(&self) -> Option { self.root_block_cfg.clone() } - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] fn set_data_block_cfg(&mut self, block_cfg: BlockDeviceConfig) { self.data_block_cfg = Some(block_cfg); } - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] fn get_data_block_cfg(&self) -> Option { self.data_block_cfg.clone() } @@ -206,7 +210,7 @@ impl ContextConfig { static CTX_MAP: Lazy>> = Lazy::new(|| Mutex::new(HashMap::new())); static CTX_IDS: 
AtomicI32 = AtomicI32::new(0); -#[cfg(not(feature = "tee"))] +#[cfg(all(not(feature = "tee"), not(feature = "efi")))] #[link(name = "krunfw")] extern "C" { fn krunfw_get_kernel( @@ -245,6 +249,7 @@ pub extern "C" fn krun_set_log_level(level: u32) -> i32 { } #[no_mangle] +#[cfg(not(feature = "efi"))] pub extern "C" fn krun_create_ctx() -> i32 { let krunfw_version = unsafe { krunfw_get_version() }; if krunfw_version < KRUNFW_MIN_VERSION { @@ -302,6 +307,21 @@ pub extern "C" fn krun_create_ctx() -> i32 { ctx_id } +#[no_mangle] +#[cfg(feature = "efi")] +pub extern "C" fn krun_create_ctx() -> i32 { + let ctx_cfg = ContextConfig::default(); + + let ctx_id = CTX_IDS.fetch_add(1, Ordering::SeqCst); + if ctx_id == i32::MAX || CTX_MAP.lock().unwrap().contains_key(&(ctx_id as u32)) { + // libkrun is not intended to be used as a daemon for managing VMs. + panic!("Context ID namespace exhausted"); + } + CTX_MAP.lock().unwrap().insert(ctx_id as u32, ctx_cfg); + + ctx_id +} + #[no_mangle] pub extern "C" fn krun_free_ctx(ctx_id: u32) -> i32 { match CTX_MAP.lock().unwrap().remove(&ctx_id) { @@ -435,7 +455,7 @@ pub unsafe extern "C" fn krun_set_mapped_volumes( #[allow(clippy::missing_safety_doc)] #[no_mangle] -#[cfg(feature = "tee")] +#[cfg(feature = "blk")] pub unsafe extern "C" fn krun_set_root_disk(ctx_id: u32, c_disk_path: *const c_char) -> i32 { let disk_path = match CStr::from_ptr(c_disk_path).to_str() { Ok(disk) => disk, @@ -465,7 +485,7 @@ pub unsafe extern "C" fn krun_set_root_disk(ctx_id: u32, c_disk_path: *const c_c #[allow(clippy::missing_safety_doc)] #[no_mangle] -#[cfg(feature = "tee")] +#[cfg(feature = "blk")] pub unsafe extern "C" fn krun_set_data_disk(ctx_id: u32, c_disk_path: *const c_char) -> i32 { let disk_path = match CStr::from_ptr(c_disk_path).to_str() { Ok(disk) => disk, @@ -778,7 +798,7 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { } } - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] if let Some(block_cfg) = ctx_cfg.get_root_block_cfg() { if 
ctx_cfg.vmr.add_block_device(block_cfg).is_err() { error!("Error configuring virtio-blk for root block"); @@ -786,7 +806,7 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { } } - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] if let Some(block_cfg) = ctx_cfg.get_data_block_cfg() { if ctx_cfg.vmr.add_block_device(block_cfg).is_err() { error!("Error configuring virtio-blk for data block"); diff --git a/src/utils/src/macos/epoll.rs b/src/utils/src/macos/epoll.rs index ead66377..1ec7cc71 100644 --- a/src/utils/src/macos/epoll.rs +++ b/src/utils/src/macos/epoll.rs @@ -145,8 +145,8 @@ impl Epoll { match operation { ControlOperation::Add | ControlOperation::Modify => { let mut kevs: Vec = Vec::new(); - let oneshot = if eset.contains(EventSet::EDGE_TRIGGERED) { - libc::EV_ONESHOT + let clear = if eset.contains(EventSet::EDGE_TRIGGERED) { + libc::EV_CLEAR } else { 0 }; @@ -155,7 +155,7 @@ impl Epoll { kevs.push(Kevent::new( fd as usize, libc::EVFILT_READ, - libc::EV_ADD | oneshot, + libc::EV_ADD | clear, event.u64, )); } @@ -164,7 +164,7 @@ impl Epoll { kevs.push(Kevent::new( fd as usize, libc::EVFILT_WRITE, - libc::EV_ADD | oneshot, + libc::EV_ADD | clear, event.u64, )); } @@ -271,13 +271,13 @@ impl Epoll { events[i as usize].events = EventSet::OUT.bits(); } if kevs[i as usize].0.flags & libc::EV_EOF != 0 { - events[i as usize].events |= if kevs[i as usize].0.flags & libc::EV_ONESHOT != 0 { + events[i as usize].events |= if kevs[i as usize].0.flags & libc::EV_CLEAR != 0 { EventSet::READ_HANG_UP.bits() } else { EventSet::HANG_UP.bits() }; } - events[i as usize].u64 = kevs[i as usize].udata() as u64; + events[i as usize].u64 = kevs[i as usize].udata(); } match ret { diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 106acea7..460989b4 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -6,8 +6,10 @@ edition = "2021" [features] tee = [] -amd-sev = [ "tee", "codicon", "kbs-types", "procfs", "serde", "serde_json", "sev", "curl" ] +amd-sev = [ "blk", 
"tee", "codicon", "kbs-types", "procfs", "serde", "serde_json", "sev", "curl" ] net = [] +blk = [] +efi = [ "blk", "net" ] [dependencies] crossbeam-channel = "0.5" diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 5f0d9866..78e190b8 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -7,6 +7,7 @@ use crossbeam_channel::unbounded; use std::fmt::{Display, Formatter}; use std::io; +#[cfg(target_os = "linux")] use std::os::fd::AsRawFd; use std::sync::{Arc, Mutex}; @@ -34,7 +35,7 @@ use crate::signal_handler::register_sigint_handler; #[cfg(target_os = "linux")] use crate::signal_handler::register_sigwinch_handler; use crate::terminal::term_set_raw_mode; -#[cfg(feature = "tee")] +#[cfg(feature = "blk")] use crate::vmm_config::block::BlockBuilder; use crate::vmm_config::boot_source::DEFAULT_KERNEL_CMDLINE; #[cfg(not(feature = "tee"))] @@ -58,11 +59,16 @@ use utils::eventfd::EventFd; use utils::time::TimestampUs; #[cfg(all(target_os = "linux", target_arch = "x86_64", not(feature = "tee")))] use vm_memory::mmap::GuestRegionMmap; +#[cfg(not(feature = "efi"))] +use vm_memory::mmap::MmapRegion; #[cfg(any(target_arch = "aarch64", feature = "tee"))] use vm_memory::Bytes; #[cfg(target_os = "linux")] use vm_memory::GuestMemory; -use vm_memory::{mmap::MmapRegion, GuestAddress, GuestMemoryMmap}; +use vm_memory::{GuestAddress, GuestMemoryMmap}; + +#[cfg(feature = "efi")] +static EDK2_BINARY: &[u8] = include_bytes!("../../../edk2/KRUN_EFI.silent.fd"); /// Errors associated with starting the instance. #[derive(Debug)] @@ -269,9 +275,11 @@ pub fn build_microvm( // Timestamp for measuring microVM boot duration. 
let request_ts = TimestampUs::default(); + #[cfg(not(feature = "efi"))] let kernel_bundle = vm_resources .kernel_bundle() .ok_or(StartMicrovmError::MissingKernelConfig)?; + #[cfg(not(feature = "efi"))] let kernel_region = unsafe { MmapRegion::build_raw(kernel_bundle.host_addr as *mut u8, kernel_bundle.size, 0, 0) .map_err(StartMicrovmError::KernelBundle)? @@ -292,8 +300,11 @@ pub fn build_microvm( .vm_config() .mem_size_mib .ok_or(StartMicrovmError::MissingMemSizeConfig)?, + #[cfg(not(feature = "efi"))] kernel_region, + #[cfg(not(feature = "efi"))] kernel_bundle.guest_addr, + #[cfg(not(feature = "efi"))] kernel_bundle.size, #[cfg(feature = "tee")] qboot_bundle, @@ -384,21 +395,15 @@ pub fn build_microvm( // On x86_64 always create a serial device, // while on aarch64 only create it if 'console=' is specified in the boot args. - /* - let serial_device = if cfg!(target_arch = "x86_64") - || (cfg!(target_arch = "aarch64") && kernel_cmdline.as_str().contains("console=")) - { + let serial_device = if cfg!(feature = "efi") { Some(setup_serial_device( event_manager, - Box::new(SerialStdin::get()), - Box::new(io::stdout()), + None, + Some(Box::new(io::stdout())), )?) 
} else { None }; - */ - - let serial_device = None; let exit_evt = EventFd::new(utils::eventfd::EFD_NONBLOCK) .map_err(Error::EventFd) @@ -483,11 +488,16 @@ pub fn build_microvm( #[cfg(all(target_arch = "aarch64", target_os = "macos"))] { + #[cfg(not(feature = "efi"))] + let start_addr = GuestAddress(kernel_bundle.guest_addr); + #[cfg(feature = "efi")] + let start_addr = GuestAddress(0u64); + vcpus = create_vcpus_aarch64( &vm, &vcpu_config, &guest_memory, - GuestAddress(kernel_bundle.guest_addr), + start_addr, request_ts, &exit_evt, intc.clone().unwrap(), @@ -541,15 +551,20 @@ pub fn build_microvm( shm_region, intc.clone(), )?; - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] attach_block_devices(&mut vmm, &vm_resources.block, event_manager, intc.clone())?; if let Some(vsock) = vm_resources.vsock.get() { - attach_unixsock_vsock_device(&mut vmm, vsock, event_manager, intc)?; + attach_unixsock_vsock_device(&mut vmm, vsock, event_manager, intc.clone())?; vmm.kernel_cmdline.insert_str("tsi_hijack")?; } #[cfg(feature = "net")] - attach_net_devices(&mut vmm, vm_resources.net_builder.iter(), event_manager)?; + attach_net_devices( + &mut vmm, + vm_resources.net_builder.iter(), + event_manager, + intc, + )?; if let Some(s) = &vm_resources.boot_config.kernel_cmdline_epilog { vmm.kernel_cmdline.insert_str(s).unwrap(); @@ -681,7 +696,7 @@ pub fn create_guest_memory( Ok((guest_mem, arch_mem_info)) } -#[cfg(target_arch = "aarch64")] +#[cfg(all(target_arch = "aarch64", not(feature = "efi")))] pub fn create_guest_memory( mem_size_mib: usize, kernel_region: MmapRegion, @@ -701,6 +716,20 @@ pub fn create_guest_memory( Ok((guest_mem, arch_mem_info)) } +#[cfg(all(target_arch = "aarch64", feature = "efi"))] +pub fn create_guest_memory( + mem_size_mib: usize, +) -> std::result::Result<(GuestMemoryMmap, ArchMemoryInfo), StartMicrovmError> { + let mem_size = mem_size_mib << 20; + let (arch_mem_info, arch_mem_regions) = arch::arch_memory_regions(mem_size); + + let guest_mem = 
GuestMemoryMmap::from_ranges(&arch_mem_regions) + .map_err(StartMicrovmError::GuestMemoryMmap)?; + + guest_mem.write(EDK2_BINARY, GuestAddress(0u64)).unwrap(); + Ok((guest_mem, arch_mem_info)) +} + #[cfg(all(target_arch = "x86_64", not(feature = "tee")))] fn load_cmdline(vmm: &Vmm) -> std::result::Result<(), StartMicrovmError> { kernel::loader::load_cmdline( @@ -777,20 +806,23 @@ pub fn setup_interrupt_controller( /// Sets up the serial device. pub fn setup_serial_device( event_manager: &mut EventManager, - input: Box, - out: Box, + input: Option>, + out: Option>, ) -> std::result::Result>, StartMicrovmError> { let interrupt_evt = EventFd::new(utils::eventfd::EFD_NONBLOCK) .map_err(Error::EventFd) .map_err(StartMicrovmError::Internal)?; - let serial = Arc::new(Mutex::new(Serial::new_in_out(interrupt_evt, input, out))); - if let Err(e) = event_manager.add_subscriber(serial.clone()) { - // TODO: We just log this message, and immediately return Ok, instead of returning the - // actual error because this operation always fails with EPERM when adding a fd which - // has been redirected to /dev/null via dup2 (this may happen inside the jailer). - // Find a better solution to this (and think about the state of the serial device - // while we're at it). - warn!("Could not add serial input event to epoll: {:?}", e); + let has_input = input.is_some(); + let serial = Arc::new(Mutex::new(Serial::new(interrupt_evt, out, input))); + if has_input { + if let Err(e) = event_manager.add_subscriber(serial.clone()) { + // TODO: We just log this message, and immediately return Ok, instead of returning the + // actual error because this operation always fails with EPERM when adding a fd which + // has been redirected to /dev/null via dup2 (this may happen inside the jailer). + // Find a better solution to this (and think about the state of the serial device + // while we're at it). 
+ warn!("Could not add serial input event to epoll: {:?}", e); + } } Ok(serial) } @@ -1133,9 +1165,15 @@ fn attach_net_devices<'a>( vmm: &mut Vmm, net_devices: impl Iterator>>, event_manager: &mut EventManager, + intc: Option>>, ) -> Result<(), StartMicrovmError> { for net_device in net_devices { let id = net_device.lock().unwrap().id().to_string(); + + if let Some(ref intc) = intc { + net_device.lock().unwrap().set_intc(intc.clone()); + } + event_manager .add_subscriber(net_device.clone()) .map_err(StartMicrovmError::RegisterEvent)?; @@ -1210,7 +1248,7 @@ fn attach_balloon_device( Ok(()) } -#[cfg(feature = "tee")] +#[cfg(feature = "blk")] fn attach_block_devices( vmm: &mut Vmm, block_devs: &BlockBuilder, diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index b227f980..0aa29d19 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -16,7 +16,7 @@ use serde::{Deserialize, Serialize}; #[cfg(feature = "tee")] use kbs_types::Tee; -#[cfg(feature = "tee")] +#[cfg(feature = "blk")] use crate::vmm_config::block::{BlockBuilder, BlockConfigError, BlockDeviceConfig}; use crate::vmm_config::boot_source::{BootSourceConfig, BootSourceConfigError}; #[cfg(not(feature = "tee"))] @@ -101,7 +101,7 @@ pub struct VmResources { /// The vsock device. pub vsock: VsockBuilder, /// The virtio-blk device. - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] pub block: BlockBuilder, /// The network devices builder. #[cfg(feature = "net")] @@ -232,7 +232,7 @@ impl VmResources { self.fs.insert(config) } - #[cfg(feature = "tee")] + #[cfg(feature = "blk")] pub fn add_block_device(&mut self, config: BlockDeviceConfig) -> Result { self.block.insert(config) } diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index 2fbdb25d..8f772b17 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 /// Wrapper for configuring the Block devices attached to the microVM. 
-#[cfg(feature = "tee")] +#[cfg(feature = "blk")] pub mod block; /// Wrapper for configuring the microVM boot source.