From 5702761073f602033654be6d353215457d645d96 Mon Sep 17 00:00:00 2001 From: tiif Date: Wed, 14 Aug 2024 18:45:29 +0800 Subject: [PATCH] Implement epoll shim --- src/helpers.rs | 8 + src/machine.rs | 5 + src/shims/mod.rs | 2 +- src/shims/unix/fd.rs | 107 +++++- src/shims/unix/fs.rs | 4 + src/shims/unix/linux/epoll.rs | 382 ++++++++++++++++++--- src/shims/unix/linux/eventfd.rs | 51 ++- src/shims/unix/mod.rs | 1 + src/shims/unix/socket.rs | 91 ++++- tests/fail-dep/tokio/sleep.rs | 2 +- tests/fail-dep/tokio/sleep.stderr | 4 +- tests/pass-dep/libc/libc-epoll.rs | 552 ++++++++++++++++++++++++++++++ 12 files changed, 1134 insertions(+), 75 deletions(-) create mode 100644 tests/pass-dep/libc/libc-epoll.rs diff --git a/src/helpers.rs b/src/helpers.rs index 1bdf9f06dc..0483745621 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -371,6 +371,14 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { path_ty_layout(this, &["std", "sys", "pal", "windows", "c", name]) } + /// Helper function to get `TyAndLayout` of an array that consists of `libc` type. + fn libc_array_ty_layout(&self, name: &str, size: u64) -> TyAndLayout<'tcx> { + let this = self.eval_context_ref(); + let elem_ty_layout = this.libc_ty_layout(name); + let array_ty = Ty::new_array(*this.tcx, elem_ty_layout.ty, size); + this.layout_of(array_ty).unwrap() + } + /// Project to the given *named* field (which must be a struct or union type). fn project_field_named>( &self, diff --git a/src/machine.rs b/src/machine.rs index 94598e7d2e..2d8b8cb71d 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -453,6 +453,9 @@ pub struct MiriMachine<'tcx> { /// The table of directory descriptors. pub(crate) dirs: shims::DirTable, + /// The list of all EpollEventInterest. + pub(crate) epoll_interests: shims::EpollInterestTable, + /// This machine's monotone clock. 
pub(crate) clock: Clock, @@ -647,6 +650,7 @@ impl<'tcx> MiriMachine<'tcx> { isolated_op: config.isolated_op, validation: config.validation, fds: shims::FdTable::init(config.mute_stdout_stderr), + epoll_interests: shims::EpollInterestTable::new(), dirs: Default::default(), layouts, threads, @@ -785,6 +789,7 @@ impl VisitProvenance for MiriMachine<'_> { data_race, alloc_addresses, fds, + epoll_interests:_, tcx: _, isolated_op: _, validation: _, diff --git a/src/shims/mod.rs b/src/shims/mod.rs index a41a2883c9..7d5349f26b 100644 --- a/src/shims/mod.rs +++ b/src/shims/mod.rs @@ -17,7 +17,7 @@ pub mod panic; pub mod time; pub mod tls; -pub use unix::{DirTable, FdTable}; +pub use unix::{DirTable, EpollInterestTable, FdTable}; /// What needs to be done after emulating an item (a shim or an intrinsic) is done. pub enum EmulateItemResult { diff --git a/src/shims/unix/fd.rs b/src/shims/unix/fd.rs index 1b25ef0576..98a124b9a5 100644 --- a/src/shims/unix/fd.rs +++ b/src/shims/unix/fd.rs @@ -6,9 +6,11 @@ use std::cell::{Ref, RefCell, RefMut}; use std::collections::BTreeMap; use std::io::{self, ErrorKind, IsTerminal, Read, SeekFrom, Write}; use std::rc::Rc; +use std::rc::Weak; use rustc_target::abi::Size; +use crate::shims::unix::linux::epoll::EpollReadyEvents; use crate::shims::unix::*; use crate::*; @@ -27,6 +29,7 @@ pub trait FileDescription: std::fmt::Debug + Any { fn read<'tcx>( &mut self, _communicate_allowed: bool, + _fd_id: FdId, _bytes: &mut [u8], _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -37,6 +40,7 @@ pub trait FileDescription: std::fmt::Debug + Any { fn write<'tcx>( &mut self, _communicate_allowed: bool, + _fd_id: FdId, _bytes: &[u8], _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -80,6 +84,7 @@ pub trait FileDescription: std::fmt::Debug + Any { fn close<'tcx>( self: Box, _communicate_allowed: bool, + _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result<()>> { throw_unsup_format!("cannot close {}", 
self.name()); } @@ -97,6 +102,11 @@ pub trait FileDescription: std::fmt::Debug + Any { // so we use a default impl here. false } + + /// Check the readiness of file description. + fn get_epoll_ready_events<'tcx>(&self) -> InterpResult<'tcx, EpollReadyEvents> { + throw_unsup_format!("{}: epoll does not support this file description", self.name()); + } } impl dyn FileDescription { @@ -119,6 +129,7 @@ impl FileDescription for io::Stdin { fn read<'tcx>( &mut self, communicate_allowed: bool, + _fd_id: FdId, bytes: &mut [u8], _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -142,6 +153,7 @@ impl FileDescription for io::Stdout { fn write<'tcx>( &mut self, _communicate_allowed: bool, + _fd_id: FdId, bytes: &[u8], _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -170,6 +182,7 @@ impl FileDescription for io::Stderr { fn write<'tcx>( &mut self, _communicate_allowed: bool, + _fd_id: FdId, bytes: &[u8], _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -195,6 +208,7 @@ impl FileDescription for NullOutput { fn write<'tcx>( &mut self, _communicate_allowed: bool, + _fd_id: FdId, bytes: &[u8], _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -203,36 +217,98 @@ impl FileDescription for NullOutput { } } +/// Structure contains both the file description and its unique identifier. 
+#[derive(Clone, Debug)] +pub struct FileDescWithId { + id: FdId, + file_description: RefCell>, +} + #[derive(Clone, Debug)] -pub struct FileDescriptionRef(Rc>>); +pub struct FileDescriptionRef(Rc>); impl FileDescriptionRef { - fn new(fd: impl FileDescription) -> Self { - FileDescriptionRef(Rc::new(RefCell::new(Box::new(fd)))) + fn new(fd: impl FileDescription, id: FdId) -> Self { + FileDescriptionRef(Rc::new(FileDescWithId { + id, + file_description: RefCell::new(Box::new(fd)), + })) } pub fn borrow(&self) -> Ref<'_, dyn FileDescription> { - Ref::map(self.0.borrow(), |fd| fd.as_ref()) + Ref::map(self.0.file_description.borrow(), |fd| fd.as_ref()) } pub fn borrow_mut(&self) -> RefMut<'_, dyn FileDescription> { - RefMut::map(self.0.borrow_mut(), |fd| fd.as_mut()) + RefMut::map(self.0.file_description.borrow_mut(), |fd| fd.as_mut()) } - pub fn close<'ctx>(self, communicate_allowed: bool) -> InterpResult<'ctx, io::Result<()>> { + pub fn close<'tcx>( + self, + communicate_allowed: bool, + ecx: &mut MiriInterpCx<'tcx>, + ) -> InterpResult<'tcx, io::Result<()>> { // Destroy this `Rc` using `into_inner` so we can call `close` instead of // implicitly running the destructor of the file description. + let id = self.get_id(); match Rc::into_inner(self.0) { - Some(fd) => RefCell::into_inner(fd).close(communicate_allowed), + Some(fd) => { + // Remove entry from the global epoll_event_interest table. + ecx.machine.epoll_interests.remove(id); + + RefCell::into_inner(fd.file_description).close(communicate_allowed, ecx) + } None => Ok(Ok(())), } } + + pub fn downgrade(&self) -> WeakFileDescriptionRef { + WeakFileDescriptionRef { weak_ref: Rc::downgrade(&self.0) } + } + + pub fn get_id(&self) -> FdId { + self.0.id + } + + /// Function used to retrieve the readiness events of a file description and insert + /// an `EpollEventInstance` into the ready list if the file description is ready. 
+ pub(crate) fn check_and_update_readiness<'tcx>( + &self, + ecx: &mut InterpCx<'tcx, MiriMachine<'tcx>>, + ) -> InterpResult<'tcx, ()> { + use crate::shims::unix::linux::epoll::EvalContextExt; + ecx.check_and_update_readiness(self.get_id(), || self.borrow_mut().get_epoll_ready_events()) + } +} + +/// Holds a weak reference to the actual file description. +#[derive(Clone, Debug, Default)] +pub struct WeakFileDescriptionRef { + weak_ref: Weak>, +} + +impl WeakFileDescriptionRef { + pub fn upgrade(&self) -> Option { + if let Some(file_desc_with_id) = self.weak_ref.upgrade() { + return Some(FileDescriptionRef(file_desc_with_id)); + } + None + } } +/// A unique id for file descriptions. While we could use the address, considering that +/// is definitely unique, the address would expose interpreter internal state when used +/// for sorting things. So instead we generate a unique id per file description that stays +/// the same even if a file descriptor is duplicated and gets a new integer file descriptor. +#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Ord, PartialOrd)] +pub struct FdId(usize); + /// The file descriptor table #[derive(Debug)] pub struct FdTable { - fds: BTreeMap, + pub fds: BTreeMap, + /// Unique identifier for file description, used to differentiate between various file description. + next_file_description_id: FdId, } impl VisitProvenance for FdTable { @@ -243,7 +319,7 @@ impl VisitProvenance for FdTable { impl FdTable { fn new() -> Self { - FdTable { fds: BTreeMap::new() } + FdTable { fds: BTreeMap::new(), next_file_description_id: FdId(0) } } pub(crate) fn init(mute_stdout_stderr: bool) -> FdTable { let mut fds = FdTable::new(); @@ -260,7 +336,8 @@ impl FdTable { /// Insert a new file description to the FdTable. 
pub fn insert_new(&mut self, fd: impl FileDescription) -> i32 { - let file_handle = FileDescriptionRef::new(fd); + let file_handle = FileDescriptionRef::new(fd, self.next_file_description_id); + self.next_file_description_id = FdId(self.next_file_description_id.0.strict_add(1)); self.insert_ref_with_min_fd(file_handle, 0) } @@ -337,7 +414,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { // If old_fd and new_fd point to the same description, then `dup_fd` ensures we keep the underlying file description alive. if let Some(file_description) = this.machine.fds.fds.insert(new_fd, dup_fd) { // Ignore close error (not interpreter's) according to dup2() doc. - file_description.close(this.machine.communicate())?.ok(); + file_description.close(this.machine.communicate(), this)?.ok(); } } Ok(Scalar::from_i32(new_fd)) @@ -442,7 +519,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { let Some(file_description) = this.machine.fds.remove(fd) else { return Ok(Scalar::from_i32(this.fd_not_found()?)); }; - let result = file_description.close(this.machine.communicate())?; + let result = file_description.close(this.machine.communicate(), this)?; // return `0` if close is successful let result = result.map(|()| 0i32); Ok(Scalar::from_i32(this.try_unwrap_io_result(result)?)) @@ -499,7 +576,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { // `usize::MAX` because it is bounded by the host's `isize`. 
let mut bytes = vec![0; usize::try_from(count).unwrap()]; let result = match offset { - None => fd.borrow_mut().read(communicate, &mut bytes, this), + None => fd.borrow_mut().read(communicate, fd.get_id(), &mut bytes, this), Some(offset) => { let Ok(offset) = u64::try_from(offset) else { let einval = this.eval_libc("EINVAL"); @@ -509,7 +586,6 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { fd.borrow_mut().pread(communicate, &mut bytes, offset, this) } }; - drop(fd); // `File::read` never returns a value larger than `count`, so this cannot fail. match result?.map(|c| i64::try_from(c).unwrap()) { @@ -558,7 +634,7 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { }; let result = match offset { - None => fd.borrow_mut().write(communicate, &bytes, this), + None => fd.borrow_mut().write(communicate, fd.get_id(), &bytes, this), Some(offset) => { let Ok(offset) = u64::try_from(offset) else { let einval = this.eval_libc("EINVAL"); @@ -568,7 +644,6 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { fd.borrow_mut().pwrite(communicate, &bytes, offset, this) } }; - drop(fd); let result = result?.map(|c| i64::try_from(c).unwrap()); Ok(Scalar::from_target_isize(this.try_unwrap_io_result(result)?, this)) diff --git a/src/shims/unix/fs.rs b/src/shims/unix/fs.rs index d93374db81..9da36e64a0 100644 --- a/src/shims/unix/fs.rs +++ b/src/shims/unix/fs.rs @@ -12,6 +12,7 @@ use rustc_data_structures::fx::FxHashMap; use rustc_target::abi::Size; use crate::shims::os_str::bytes_to_os_str; +use crate::shims::unix::fd::FdId; use crate::shims::unix::*; use crate::*; use shims::time::system_time_to_duration; @@ -32,6 +33,7 @@ impl FileDescription for FileHandle { fn read<'tcx>( &mut self, communicate_allowed: bool, + _fd_id: FdId, bytes: &mut [u8], _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -42,6 +44,7 @@ impl FileDescription for FileHandle { fn write<'tcx>( &mut self, communicate_allowed: bool, + _fd_id: FdId, bytes: 
&[u8], _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -109,6 +112,7 @@ impl FileDescription for FileHandle { fn close<'tcx>( self: Box, communicate_allowed: bool, + _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result<()>> { assert!(communicate_allowed, "isolation should have prevented even opening a file"); // We sync the file if it was opened in a mode different than read-only. diff --git a/src/shims/unix/linux/epoll.rs b/src/shims/unix/linux/epoll.rs index 9127db3d00..89616bd0d0 100644 --- a/src/shims/unix/linux/epoll.rs +++ b/src/shims/unix/linux/epoll.rs @@ -1,32 +1,103 @@ +use std::cell::RefCell; +use std::collections::BTreeMap; use std::io; +use std::rc::{Rc, Weak}; -use rustc_data_structures::fx::FxHashMap; - +use crate::shims::unix::fd::FdId; use crate::shims::unix::*; use crate::*; /// An `Epoll` file descriptor connects file handles and epoll events #[derive(Clone, Debug, Default)] struct Epoll { - /// The file descriptors we are watching, and what we are watching for. - file_descriptors: FxHashMap, + /// A map of EpollEventInterests registered under this epoll instance. + /// Each entry is differentiated using FdId and file descriptor value. + interest_list: BTreeMap<(FdId, i32), Rc>>, + /// A map of EpollEventInstance that will be returned when `epoll_wait` is called. + /// Similar to interest_list, the entry is also differentiated using FdId + /// and file descriptor value. + // This is an Rc because EpollInterest need to hold a reference to update + // it. + ready_list: Rc>>, } -/// Epoll Events associate events with data. -/// These fields are currently unused by miri. -/// This matches the `epoll_event` struct defined +/// EpollEventInstance contains information that will be returned by epoll_wait. +#[derive(Debug)] +pub struct EpollEventInstance { + /// Xor-ed event types that happened to the file description. + events: u32, + /// Original data retrieved from `epoll_event` during `epoll_ctl`. 
+ data: u64, +} + +impl EpollEventInstance { + pub fn new(events: u32, data: u64) -> EpollEventInstance { + EpollEventInstance { events, data } + } +} +/// EpollEventInterest registers the file description information to an epoll +/// instance during a successful `epoll_ctl` call. It also stores additional +/// information needed to check and update readiness state for `epoll_wait`. +/// +/// `events` and `data` field matches the `epoll_event` struct defined /// by the epoll_ctl man page. For more information /// see the man page: /// /// #[derive(Clone, Debug)] -struct EpollEvent { - #[allow(dead_code)] +pub struct EpollEventInterest { + /// The file descriptor value of the file description registered. + file_descriptor: i32, + /// The events bitmask retrieved from `epoll_event`. events: u32, - /// `Scalar` is used to represent the - /// `epoll_data` type union. - #[allow(dead_code)] - data: Scalar, + /// The data retrieved from `epoll_event`. + /// libc's data field in epoll_event can store integer or pointer, + /// but only u64 is supported for now. + /// + data: u64, + /// Ready list of the epoll instance under which this EpollEventInterest is registered. + ready_list: Rc>>, +} + +/// EpollReadyEvents reflects the readiness of a file description. +pub struct EpollReadyEvents { + /// The associated file is available for read(2) operations. + pub epollin: bool, + /// The associated file is available for write(2) operations. + pub epollout: bool, + /// Stream socket peer closed connection, or shut down writing + /// half of connection. 
+ pub epollrdhup: bool, +} + +impl EpollReadyEvents { + pub fn new() -> Self { + EpollReadyEvents { epollin: false, epollout: false, epollrdhup: false } + } + + pub fn get_event_bitmask<'tcx>(&self, ecx: &MiriInterpCx<'tcx>) -> u32 { + let epollin = ecx.eval_libc_u32("EPOLLIN"); + let epollout = ecx.eval_libc_u32("EPOLLOUT"); + let epollrdhup = ecx.eval_libc_u32("EPOLLRDHUP"); + + let mut bitmask = 0; + if self.epollin { + bitmask |= epollin; + } + if self.epollout { + bitmask |= epollout; + } + if self.epollrdhup { + bitmask |= epollrdhup; + } + bitmask + } +} + +impl Epoll { + fn get_ready_list(&self) -> Rc>> { + Rc::clone(&self.ready_list) + } } impl FileDescription for Epoll { @@ -37,11 +108,51 @@ impl FileDescription for Epoll { fn close<'tcx>( self: Box, _communicate_allowed: bool, + _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result<()>> { Ok(Ok(())) } } +/// The table of all EpollEventInterest. +/// The BTreeMap key is the FdId of an active file description registered with +/// any epoll instance. The value is a list of EpollEventInterest associated +/// with that file description. +pub struct EpollInterestTable(BTreeMap>>>); + +impl EpollInterestTable { + pub(crate) fn new() -> Self { + EpollInterestTable(BTreeMap::new()) + } + + pub fn insert_epoll_interest(&mut self, id: FdId, fd: Weak>) { + match self.0.get_mut(&id) { + Some(fds) => { + fds.push(fd); + } + None => { + let vec = vec![fd]; + self.0.insert(id, vec); + } + } + } + + pub fn get_epoll_interest(&self, id: FdId) -> Option<&Vec>>> { + self.0.get(&id) + } + + pub fn get_epoll_interest_mut( + &mut self, + id: FdId, + ) -> Option<&mut Vec>>> { + self.0.get_mut(&id) + } + + pub fn remove(&mut self, id: FdId) { + self.0.remove(&id); + } +} + impl<'tcx> EvalContextExt<'tcx> for crate::MiriInterpCx<'tcx> {} pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { /// This function returns a file descriptor referring to the new `Epoll` instance. 
This file @@ -64,6 +175,9 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { ); } + let mut epoll_instance = Epoll::default(); + epoll_instance.ready_list = Rc::new(RefCell::new(BTreeMap::new())); + let fd = this.machine.fds.insert_new(Epoll::default()); Ok(Scalar::from_i32(fd)) } @@ -90,48 +204,143 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { ) -> InterpResult<'tcx, Scalar> { let this = self.eval_context_mut(); - let epfd = this.read_scalar(epfd)?.to_i32()?; + let epfd_value = this.read_scalar(epfd)?.to_i32()?; let op = this.read_scalar(op)?.to_i32()?; let fd = this.read_scalar(fd)?.to_i32()?; - let _event = this.read_scalar(event)?.to_pointer(this)?; + let event = this.deref_pointer_as(event, this.libc_ty_layout("epoll_event"))?; let epoll_ctl_add = this.eval_libc_i32("EPOLL_CTL_ADD"); let epoll_ctl_mod = this.eval_libc_i32("EPOLL_CTL_MOD"); let epoll_ctl_del = this.eval_libc_i32("EPOLL_CTL_DEL"); + let epollin = this.eval_libc_u32("EPOLLIN"); + let epollout = this.eval_libc_u32("EPOLLOUT"); + let epollrdhup = this.eval_libc_u32("EPOLLRDHUP"); + let epollet = this.eval_libc_u32("EPOLLET"); + + // Fail on unsupported operations. + if op & epoll_ctl_add != epoll_ctl_add + && op & epoll_ctl_mod != epoll_ctl_mod + && op & epoll_ctl_del != epoll_ctl_del + { + throw_unsup_format!("epoll_ctl: encountered unknown unsupported operation {:#x}", op); + } + + // Check if epfd is a valid epoll file descriptor. 
+ let Some(epfd) = this.machine.fds.get_ref(epfd_value) else { + return Ok(Scalar::from_i32(this.fd_not_found()?)); + }; + let mut binding = epfd.borrow_mut(); + let epoll_file_description = &mut binding + .downcast_mut::() + .ok_or_else(|| err_unsup_format!("non-epoll FD passed to `epoll_ctl`"))?; + + let interest_list = &mut epoll_file_description.interest_list; + let ready_list = &epoll_file_description.ready_list; + + let Some(file_descriptor) = this.machine.fds.get_ref(fd) else { + return Ok(Scalar::from_i32(this.fd_not_found()?)); + }; + let id = file_descriptor.get_id(); if op == epoll_ctl_add || op == epoll_ctl_mod { - let event = this.deref_pointer_as(event, this.libc_ty_layout("epoll_event"))?; + // Read event bitmask and data from epoll_event passed by caller. + let events = this.read_scalar(&this.project_field(&event, 0)?)?.to_u32()?; + let data = this.read_scalar(&this.project_field(&event, 1)?)?.to_u64()?; - let events = this.project_field(&event, 0)?; - let events = this.read_scalar(&events)?.to_u32()?; - let data = this.project_field(&event, 1)?; - let data = this.read_scalar(&data)?; - let event = EpollEvent { events, data }; + // Unset the flag we support to discover if any unsupported flags are used. + let mut flags = events; + if events & epollet != epollet { + // We only support edge-triggered notification for now. 
+ throw_unsup_format!("epoll_ctl: epollet flag must be included."); + } else { + flags &= !epollet; + } + if flags & epollin == epollin { + flags &= !epollin; + } + if flags & epollout == epollout { + flags &= !epollout; + } + if flags & epollrdhup == epollrdhup { + flags &= !epollrdhup; + } + if flags != 0 { + throw_unsup_format!( + "epoll_ctl: encountered unknown unsupported flags {:#x}", + flags + ); + } - let Some(mut epfd) = this.machine.fds.get_mut(epfd) else { - return Ok(Scalar::from_i32(this.fd_not_found()?)); - }; - let epfd = epfd - .downcast_mut::() - .ok_or_else(|| err_unsup_format!("non-epoll FD passed to `epoll_ctl`"))?; + let epoll_key = (id, fd); + + // Check the existence of fd in the interest list. + if op == epoll_ctl_add { + if interest_list.contains_key(&epoll_key) { + let eexist = this.eval_libc("EEXIST"); + this.set_last_error(eexist)?; + return Ok(Scalar::from_i32(-1)); + } + } else { + if !interest_list.contains_key(&epoll_key) { + let enoent = this.eval_libc("ENOENT"); + this.set_last_error(enoent)?; + return Ok(Scalar::from_i32(-1)); + } + } + + let id = file_descriptor.get_id(); + // Create an epoll_interest. + let interest = Rc::new(RefCell::new(EpollEventInterest { + file_descriptor: fd, + events, + data, + ready_list: Rc::clone(ready_list), + })); + + if op == epoll_ctl_add { + // Insert an epoll_interest to global epoll_interest list. + this.machine.epoll_interests.insert_epoll_interest(id, Rc::downgrade(&interest)); + interest_list.insert(epoll_key, interest); + } else { + // Directly modify the epoll_interest so the global epoll_event_interest table + // will be updated too. + let mut epoll_interest = interest_list.get_mut(&epoll_key).unwrap().borrow_mut(); + epoll_interest.events = events; + epoll_interest.data = data; + } + + // Readiness will be updated immediately when the epoll_event_interest is added or modified. 
+ file_descriptor.check_and_update_readiness(this)?; - epfd.file_descriptors.insert(fd, event); - Ok(Scalar::from_i32(0)) + return Ok(Scalar::from_i32(0)); } else if op == epoll_ctl_del { - let Some(mut epfd) = this.machine.fds.get_mut(epfd) else { - return Ok(Scalar::from_i32(this.fd_not_found()?)); + let epoll_key = (id, fd); + + // Remove epoll_event_interest from interest_list. + let Some(epoll_interest) = interest_list.remove(&epoll_key) else { + let enoent = this.eval_libc("ENOENT"); + this.set_last_error(enoent)?; + return Ok(Scalar::from_i32(-1)); }; - let epfd = epfd - .downcast_mut::() - .ok_or_else(|| err_unsup_format!("non-epoll FD passed to `epoll_ctl`"))?; + // All related Weak will fail to upgrade after the drop. + drop(epoll_interest); - epfd.file_descriptors.remove(&fd); - Ok(Scalar::from_i32(0)) - } else { - let einval = this.eval_libc("EINVAL"); - this.set_last_error(einval)?; - Ok(Scalar::from_i32(-1)) + // Remove related epoll_interest from ready list. + ready_list.borrow_mut().remove(&epoll_key); + + // Remove dangling EpollEventInterest from its global table. + // .unwrap() below should succeed because the file description id must have registered + // at least one epoll_interest, if not, it will fail when removing epoll_interest from + // interest list. 
+ this.machine + .epoll_interests + .get_epoll_interest_mut(id) + .unwrap() + .retain(|event| event.upgrade().is_some()); + + return Ok(Scalar::from_i32(0)); } + Ok(Scalar::from_i32(-1)) } /// The `epoll_wait()` system call waits for events on the `Epoll` @@ -166,25 +375,102 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { fn epoll_wait( &mut self, epfd: &OpTy<'tcx>, - events: &OpTy<'tcx>, + events_op: &OpTy<'tcx>, maxevents: &OpTy<'tcx>, timeout: &OpTy<'tcx>, ) -> InterpResult<'tcx, Scalar> { let this = self.eval_context_mut(); let epfd = this.read_scalar(epfd)?.to_i32()?; - let _events = this.read_scalar(events)?.to_pointer(this)?; - let _maxevents = this.read_scalar(maxevents)?.to_i32()?; - let _timeout = this.read_scalar(timeout)?.to_i32()?; + let maxevents = this.read_scalar(maxevents)?.to_i32()?; + let event = this.deref_pointer_as( + events_op, + this.libc_array_ty_layout("epoll_event", maxevents.try_into().unwrap()), + )?; + let timeout = this.read_scalar(timeout)?.to_i32()?; - let Some(mut epfd) = this.machine.fds.get_mut(epfd) else { + if epfd <= 0 { + let einval = this.eval_libc("EINVAL"); + this.set_last_error(einval)?; + return Ok(Scalar::from_i32(-1)); + } + // FIXME: Implement blocking support + if timeout != 0 { + throw_unsup_format!("epoll_wait: timeout value can only be 0"); + } + + let Some(epfd) = this.machine.fds.get_ref(epfd) else { return Ok(Scalar::from_i32(this.fd_not_found()?)); }; - let _epfd = epfd + let mut binding = epfd.borrow_mut(); + let epoll_file_description = &mut binding .downcast_mut::() .ok_or_else(|| err_unsup_format!("non-epoll FD passed to `epoll_wait`"))?; - // FIXME return number of events ready when scheme for marking events ready exists - throw_unsup_format!("returning ready events from epoll_wait is not yet implemented"); + let binding = epoll_file_description.get_ready_list(); + let mut ready_list = binding.borrow_mut(); + let mut num_of_events: i32 = 0; + let mut array_iter = 
this.project_array_fields(&event)?; + + while let Some((epoll_key, epoll_return)) = ready_list.pop_first() { + // If the file description is fully close, the entry for corresponding FdID in the + // global epoll event interest table would be empty. + if this.machine.epoll_interests.get_epoll_interest(epoll_key.0).is_some() { + // Return notification to the caller if the file description is not fully closed. + if let Some(des) = array_iter.next(this)? { + this.write_int_fields_named( + &[ + ("events", epoll_return.events.into()), + ("u64", epoll_return.data.into()), + ], + &des.1, + )?; + num_of_events = num_of_events.checked_add(1).unwrap(); + } else { + break; + } + } + } + Ok(Scalar::from_i32(num_of_events)) + } + + /// For a specific unique file descriptor id, get its ready events and update + /// the corresponding ready list. This function is called whenever a file description + /// is registered with epoll, or when read, write, or close operations are performed, + /// regardless of any changes in readiness. + /// + /// This is an internal helper function and is typically not meant to be used directly. + /// In most cases, `FileDescriptionRef::check_and_update_readiness` should be preferred. + fn check_and_update_readiness( + &self, + id: FdId, + get_ready_events: impl FnOnce() -> InterpResult<'tcx, EpollReadyEvents>, + ) -> InterpResult<'tcx, ()> { + let this = self.eval_context_ref(); + // Get a list of EpollEventInterest that is associated to a specific file description. + if let Some(epoll_interests) = this.machine.epoll_interests.get_epoll_interest(id) { + let epoll_ready_events = get_ready_events()?; + // Get the bitmask of ready events. + let ready_events = epoll_ready_events.get_event_bitmask(this); + + for weak_epoll_interest in epoll_interests { + if let Some(epoll_interest) = weak_epoll_interest.upgrade() { + // This checks if any of the events specified in epoll_event_interest.events + // match those in ready_events. 
+ let epoll_event_interest = epoll_interest.borrow(); + let flags = epoll_event_interest.events & ready_events; + // If there is any event that we are interested in being specified as ready, + // insert an epoll_return to the ready list. + if flags != 0 { + let epoll_key = (id, epoll_event_interest.file_descriptor); + let ready_list = &mut epoll_event_interest.ready_list.borrow_mut(); + let event_instance = + EpollEventInstance::new(flags, epoll_event_interest.data); + ready_list.insert(epoll_key, event_instance); + } + } + } + } + Ok(()) } } diff --git a/src/shims/unix/linux/eventfd.rs b/src/shims/unix/linux/eventfd.rs index 4ab8760d93..8a11f225b2 100644 --- a/src/shims/unix/linux/eventfd.rs +++ b/src/shims/unix/linux/eventfd.rs @@ -3,8 +3,10 @@ use std::io; use std::io::{Error, ErrorKind}; use std::mem; +use fd::FdId; use rustc_target::abi::Endian; +use crate::shims::unix::linux::epoll::EpollReadyEvents; use crate::shims::unix::*; use crate::{concurrency::VClock, *}; @@ -35,9 +37,21 @@ impl FileDescription for Event { "event" } + fn get_epoll_ready_events<'tcx>(&self) -> InterpResult<'tcx, EpollReadyEvents> { + // We only check the status of EPOLLIN and EPOLLOUT flags for eventfd. If other event flags + // need to be supported in the future, the check should be added here. 
+ + Ok(EpollReadyEvents { + epollin: self.counter != 0, + epollout: self.counter != MAX_COUNTER, + ..EpollReadyEvents::new() + }) + } + fn close<'tcx>( self: Box, _communicate_allowed: bool, + _ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result<()>> { Ok(Ok(())) } @@ -46,6 +60,7 @@ impl FileDescription for Event { fn read<'tcx>( &mut self, _communicate_allowed: bool, + fd_id: FdId, bytes: &mut [u8], ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -70,6 +85,18 @@ impl FileDescription for Event { Endian::Big => self.counter.to_be_bytes(), }; self.counter = 0; + // When any of the event happened, we check and update the status of all supported event + // types for current file description. + + // We have to use our own FdID in contrast to every other file descriptor out there, because + // we are updating ourselves when writing and reading. Technically `Event` is like socketpair, but + // it does not create two separate file descriptors. Thus we can't re-borrow ourselves via + // `FileDescriptionRef::check_and_update_readiness` while already being mutably borrowed for read/write. + crate::shims::unix::linux::epoll::EvalContextExt::check_and_update_readiness( + ecx, + fd_id, + || self.get_epoll_ready_events(), + )?; return Ok(Ok(U64_ARRAY_SIZE)); } } @@ -89,6 +116,7 @@ impl FileDescription for Event { fn write<'tcx>( &mut self, _communicate_allowed: bool, + fd_id: FdId, bytes: &[u8], ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -124,6 +152,17 @@ impl FileDescription for Event { } } }; + // When any of the event happened, we check and update the status of all supported event + // types for current file description. + + // Just like read() above, we use this internal method to not get the second borrow of the + // RefCell of this FileDescription. This is a special case, we should only use + // FileDescriptionRef::check_and_update_readiness in normal case. 
+ crate::shims::unix::linux::epoll::EvalContextExt::check_and_update_readiness( + ecx, + fd_id, + || self.get_epoll_ready_events(), + )?; Ok(Ok(U64_ARRAY_SIZE)) } } @@ -178,11 +217,11 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { throw_unsup_format!("eventfd: encountered unknown unsupported flags {:#x}", flags); } - let fd = this.machine.fds.insert_new(Event { - counter: val.into(), - is_nonblock, - clock: VClock::default(), - }); - Ok(Scalar::from_i32(fd)) + let fds = &mut this.machine.fds; + + let fd_value = + fds.insert_new(Event { counter: val.into(), is_nonblock, clock: VClock::default() }); + + Ok(Scalar::from_i32(fd_value)) } } diff --git a/src/shims/unix/mod.rs b/src/shims/unix/mod.rs index dc9068fddd..8cfa659d90 100644 --- a/src/shims/unix/mod.rs +++ b/src/shims/unix/mod.rs @@ -17,6 +17,7 @@ mod solarish; pub use env::UnixEnvVars; pub use fd::{FdTable, FileDescription}; pub use fs::DirTable; +pub use linux::epoll::EpollInterestTable; // All the Unix-specific extension traits pub use env::EvalContextExt as _; pub use fd::EvalContextExt as _; diff --git a/src/shims/unix/socket.rs b/src/shims/unix/socket.rs index 455820a9e6..0f40d9776b 100644 --- a/src/shims/unix/socket.rs +++ b/src/shims/unix/socket.rs @@ -4,6 +4,8 @@ use std::io; use std::io::{Error, ErrorKind, Read}; use std::rc::{Rc, Weak}; +use crate::shims::unix::fd::{FdId, WeakFileDescriptionRef}; +use crate::shims::unix::linux::epoll::EpollReadyEvents; use crate::shims::unix::*; use crate::{concurrency::VClock, *}; @@ -19,6 +21,11 @@ struct SocketPair { // gone, and trigger EPIPE as appropriate. writebuf: Weak>, readbuf: Rc>, + /// When a socketpair instance is created, two socketpair file descriptions are generated. + /// The peer_fd field holds a weak reference to the file description of peer socketpair. + // TODO: It might be possible to retrieve writebuf from peer_fd and remove the writebuf + // field above. 
+ peer_fd: WeakFileDescriptionRef, is_nonblock: bool, } @@ -37,21 +44,62 @@ impl FileDescription for SocketPair { "socketpair" } + fn get_epoll_ready_events<'tcx>(&self) -> InterpResult<'tcx, EpollReadyEvents> { + // We only check the status of EPOLLIN, EPOLLOUT and EPOLLRDHUP flags. If other event flags + // need to be supported in the future, the check should be added here. + + let mut epoll_ready_events = EpollReadyEvents::new(); + let readbuf = self.readbuf.borrow(); + + // Check if it is readable. + if !readbuf.buf.is_empty() { + epoll_ready_events.epollin = true; + } + + // Check if is writable. + if let Some(writebuf) = self.writebuf.upgrade() { + let writebuf = writebuf.borrow(); + let data_size = writebuf.buf.len(); + let available_space = MAX_SOCKETPAIR_BUFFER_CAPACITY.strict_sub(data_size); + if available_space != 0 { + epoll_ready_events.epollout = true; + } + } + + // Check if the peer_fd closed + if self.peer_fd.upgrade().is_none() { + epoll_ready_events.epollrdhup = true; + // This is an edge case. Whenever epollrdhup is triggered, epollin will be added + // even though there is no data in the buffer. + epoll_ready_events.epollin = true; + } + Ok(epoll_ready_events) + } + fn close<'tcx>( self: Box, _communicate_allowed: bool, + ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result<()>> { // This is used to signal socketfd of other side that there is no writer to its readbuf. // If the upgrade fails, there is no need to update as all read ends have been dropped. if let Some(writebuf) = self.writebuf.upgrade() { writebuf.borrow_mut().buf_has_writer = false; }; + + // Notify peer fd that closed has happened. + if let Some(peer_fd) = self.peer_fd.upgrade() { + // When any of the event happened, we check and update the status of all supported events + // types of peer fd. 
+ peer_fd.check_and_update_readiness(ecx)?; + } Ok(Ok(())) } fn read<'tcx>( &mut self, _communicate_allowed: bool, + _fd_id: FdId, bytes: &mut [u8], ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -88,15 +136,33 @@ impl FileDescription for SocketPair { // FIXME: this over-synchronizes; a more precise approach would be to // only sync with the writes whose data we will read. ecx.acquire_clock(&readbuf.clock); + // Do full read / partial read based on the space available. // Conveniently, `read` exists on `VecDeque` and has exactly the desired behavior. let actual_read_size = readbuf.buf.read(bytes).unwrap(); + + // The readbuf needs to be explicitly dropped because it will cause panic when + // check_and_update_readiness borrows it again. + drop(readbuf); + + // A notification should be provided for the peer file description even when it can + // only write 1 byte. This implementation is not compliant with the actual Linux kernel + // implementation. For optimization reasons, the kernel will only mark the file description + // as "writable" when it can write more than a certain number of bytes. Since we + // don't know what that *certain number* is, we will provide a notification every time + // a read is successful. This might result in our epoll emulation providing more + // notifications than the real system. + if let Some(peer_fd) = self.peer_fd.upgrade() { + peer_fd.check_and_update_readiness(ecx)?; + } + return Ok(Ok(actual_read_size)); } fn write<'tcx>( &mut self, _communicate_allowed: bool, + _fd_id: FdId, bytes: &[u8], ecx: &mut MiriInterpCx<'tcx>, ) -> InterpResult<'tcx, io::Result> { @@ -131,6 +197,14 @@ impl FileDescription for SocketPair { // Do full write / partial write based on the space available. 
let actual_write_size = write_size.min(available_space); writebuf.buf.extend(&bytes[..actual_write_size]); + + // The writebuf needs to be explicitly dropped because it will cause panic when + // check_and_update_readiness borrows it again. + drop(writebuf); + // Notification should be provided for peer fd as it became readable. + if let Some(peer_fd) = self.peer_fd.upgrade() { + peer_fd.check_and_update_readiness(ecx)?; + } return Ok(Ok(actual_write_size)); } } @@ -209,18 +283,33 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { let socketpair_0 = SocketPair { writebuf: Rc::downgrade(&buffer1), readbuf: Rc::clone(&buffer2), + peer_fd: WeakFileDescriptionRef::default(), is_nonblock: is_sock_nonblock, }; - let socketpair_1 = SocketPair { writebuf: Rc::downgrade(&buffer2), readbuf: Rc::clone(&buffer1), + peer_fd: WeakFileDescriptionRef::default(), is_nonblock: is_sock_nonblock, }; + // Insert the file description to the fd table. let fds = &mut this.machine.fds; let sv0 = fds.insert_new(socketpair_0); let sv1 = fds.insert_new(socketpair_1); + + // Get weak file descriptor and file description id value. + let fd_ref0 = fds.get_ref(sv0).unwrap(); + let fd_ref1 = fds.get_ref(sv1).unwrap(); + let weak_fd_ref0 = fd_ref0.downgrade(); + let weak_fd_ref1 = fd_ref1.downgrade(); + + // Update peer_fd and id field. + fd_ref1.borrow_mut().downcast_mut::().unwrap().peer_fd = weak_fd_ref0; + + fd_ref0.borrow_mut().downcast_mut::().unwrap().peer_fd = weak_fd_ref1; + + // Return socketpair file description value to the caller. 
let sv0 = Scalar::from_int(sv0, sv.layout.size); let sv1 = Scalar::from_int(sv1, sv.layout.size); diff --git a/tests/fail-dep/tokio/sleep.rs b/tests/fail-dep/tokio/sleep.rs index d96d778e6c..0fa5080d48 100644 --- a/tests/fail-dep/tokio/sleep.rs +++ b/tests/fail-dep/tokio/sleep.rs @@ -1,6 +1,6 @@ //@compile-flags: -Zmiri-permissive-provenance -Zmiri-backtrace=full //@only-target-x86_64-unknown-linux: support for tokio only on linux and x86 -//@error-in-other-file: returning ready events from epoll_wait is not yet implemented +//@error-in-other-file: timeout value can only be 0 //@normalize-stderr-test: " += note:.*\n" -> "" use tokio::time::{sleep, Duration, Instant}; diff --git a/tests/fail-dep/tokio/sleep.stderr b/tests/fail-dep/tokio/sleep.stderr index 6d19faab90..d5bf00fc17 100644 --- a/tests/fail-dep/tokio/sleep.stderr +++ b/tests/fail-dep/tokio/sleep.stderr @@ -1,4 +1,4 @@ -error: unsupported operation: returning ready events from epoll_wait is not yet implemented +error: unsupported operation: epoll_wait: timeout value can only be 0 --> CARGO_REGISTRY/.../epoll.rs:LL:CC | LL | / syscall!(epoll_wait( @@ -7,7 +7,7 @@ LL | | events.as_mut_ptr(), LL | | events.capacity() as i32, LL | | timeout, LL | | )) - | |__________^ returning ready events from epoll_wait is not yet implemented + | |__________^ epoll_wait: timeout value can only be 0 | = help: this is likely not a bug in the program; it indicates that the program performed an operation that Miri does not support diff --git a/tests/pass-dep/libc/libc-epoll.rs b/tests/pass-dep/libc/libc-epoll.rs new file mode 100644 index 0000000000..11a0257dc4 --- /dev/null +++ b/tests/pass-dep/libc/libc-epoll.rs @@ -0,0 +1,552 @@ +//@only-target-linux + +#![feature(exposed_provenance)] // Needed for fn test_pointer() +use std::convert::TryInto; +use std::mem::MaybeUninit; + +fn main() { + test_event_overwrite(); + test_not_fully_closed_fd(); + test_closed_fd(); + test_epoll_socketpair_special_case(); + 
test_two_epoll_instance(); + test_epoll_ctl_mod(); + test_epoll_socketpair(); + test_epoll_eventfd(); + test_epoll_ctl_del(); + test_pointer(); + test_two_same_fd_in_same_epoll_instance(); + test_socketpair_read(); +} + +fn check_epoll_wait( + epfd: i32, + mut expected_notifications: Vec<(u32, u64)>, +) -> bool { + let epoll_event = libc::epoll_event { events: 0, u64: 0 }; + let mut array: [libc::epoll_event; N] = [epoll_event; N]; + let maxsize = N; + let array_ptr = array.as_mut_ptr(); + let res = unsafe { libc::epoll_wait(epfd, array_ptr, maxsize.try_into().unwrap(), 0) }; + assert_eq!(res, expected_notifications.len().try_into().unwrap()); + let slice = unsafe { std::slice::from_raw_parts(array_ptr, res.try_into().unwrap()) }; + let mut return_events = slice.iter(); + while let Some(return_event) = return_events.next() { + if let Some(notification) = expected_notifications.pop() { + let event = return_event.events; + let data = return_event.u64; + assert_eq!(event, notification.0); + assert_eq!(data, notification.1); + } else { + return false; + } + } + if !expected_notifications.is_empty() { + return false; + } + return true; +} + +fn test_epoll_socketpair() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create a socketpair instance. + let mut fds = [-1, -1]; + let mut res = + unsafe { libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, fds.as_mut_ptr()) }; + assert_eq!(res, 0); + + // Write to fd[0] + let data = "abcde".as_bytes().as_ptr(); + res = unsafe { libc::write(fds[0], data as *const libc::c_void, 5).try_into().unwrap() }; + assert_eq!(res, 5); + + // Register fd[1] with EPOLLIN|EPOLLOUT|EPOLLET + // EPOLLET is negative number for i32 so casting is needed to do proper bitwise OR for u32. 
+ let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT | libc::EPOLLRDHUP).unwrap() | epollet; + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fds[1]).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); + + // Check result from epoll_wait. + let expected_event = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value = u64::try_from(fds[1]).unwrap(); + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); + + // Close the peer socketpair. + let res = unsafe { libc::close(fds[0]) }; + assert_eq!(res, 0); + + // Check result from epoll_wait. + let expected_event = u32::try_from(libc::EPOLLRDHUP | libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value = u64::try_from(fds[1]).unwrap(); + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); +} + +fn test_epoll_ctl_mod() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create a socketpair instance. + let mut fds = [-1, -1]; + let mut res = + unsafe { libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, fds.as_mut_ptr()) }; + assert_eq!(res, 0); + + // Write to fd[0]. + let data = "abcde".as_bytes().as_ptr(); + res = unsafe { libc::write(fds[0], data as *const libc::c_void, 5).try_into().unwrap() }; + assert_eq!(res, 5); + + // Register fd[1] with EPOLLIN|EPOLLOUT|EPOLLET. + // EPOLLET is negative number for i32 so casting is needed to do proper bitwise OR for u32. 
+ let epollet = libc::EPOLLET as u32; + let mut flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fds[1]).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); + + // Check result from epoll_wait. + let expected_event = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value = u64::try_from(fds[1]).unwrap(); + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); + + // Test EPOLLRDHUP. + flags |= u32::try_from(libc::EPOLLRDHUP).unwrap(); + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fds[1]).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_MOD, fds[1], &mut ev) }; + assert_ne!(res, -1); + + // Close the other side of the socketpair to invoke EPOLLRDHUP. + let res = unsafe { libc::close(fds[0]) }; + assert_eq!(res, 0); + + // Check result from epoll_wait. + let expected_event = u32::try_from(libc::EPOLLRDHUP | libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value = u64::try_from(fds[1]).unwrap(); + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); +} + +fn test_epoll_ctl_del() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create a socketpair instance. + let mut fds = [-1, -1]; + let mut res = + unsafe { libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, fds.as_mut_ptr()) }; + assert_eq!(res, 0); + + // Write to fd[0] + let data = "abcde".as_bytes().as_ptr(); + res = unsafe { libc::write(fds[0], data as *const libc::c_void, 5).try_into().unwrap() }; + assert_eq!(res, 5); + + // Register fd[1] with EPOLLIN|EPOLLOUT|EPOLLET + // EPOLLET is negative number for i32 so casting is needed to do proper bitwise OR for u32. 
+ let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fds[1]).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); + + // Test EPOLL_CTL_DEL. + assert!(check_epoll_wait::<0>(epfd, vec![])); +} + +// This test is for one fd registered under two different epoll instance. +fn test_two_epoll_instance() { + // Create two epoll instance. + let epfd1 = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd1, -1); + let epfd2 = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd2, -1); + + // Create a socketpair instance. + let mut fds = [-1, -1]; + let mut res = + unsafe { libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, fds.as_mut_ptr()) }; + assert_eq!(res, 0); + + // Write to the socketpair. + let data = "abcde".as_bytes().as_ptr(); + res = unsafe { libc::write(fds[0], data as *const libc::c_void, 5).try_into().unwrap() }; + assert_eq!(res, 5); + + // Register one side of the socketpair with EPOLLIN | EPOLLOUT | EPOLLET. + let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fds[1]).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd1, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); + let res = unsafe { libc::epoll_ctl(epfd2, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); + + // Notification should be received from both instance of epoll. 
+ let expected_event = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value = u64::try_from(fds[1]).unwrap(); + assert!(check_epoll_wait::<8>(epfd1, vec![(expected_event, expected_value)])); + assert!(check_epoll_wait::<8>(epfd2, vec![(expected_event, expected_value)])); +} + +// This test is for two same file description registered under the same epoll instance through dup. +// Notification should be provided for both. +fn test_two_same_fd_in_same_epoll_instance() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create a socketpair instance. + let mut fds = [-1, -1]; + let res = unsafe { libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, fds.as_mut_ptr()) }; + assert_eq!(res, 0); + + // Dup the fd. + let newfd = unsafe { libc::dup(fds[1]) }; + assert_ne!(newfd, -1); + + // Register both fd to the same epoll instance. + let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { events: u32::try_from(flags).unwrap(), u64: 5 as u64 }; + let mut res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); + res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, newfd, &mut ev) }; + assert_ne!(res, -1); + + // Write to the socketpair. + let data = "abcde".as_bytes().as_ptr(); + res = unsafe { libc::write(fds[0], data as *const libc::c_void, 5).try_into().unwrap() }; + assert_eq!(res, 5); + + //Two notification should be received. + let expected_event = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value = 5 as u64; + assert!(check_epoll_wait::<8>( + epfd, + vec![(expected_event, expected_value), (expected_event, expected_value)] + )); +} + +fn test_epoll_eventfd() { + // Create an eventfd instance. 
+ let flags = libc::EFD_NONBLOCK | libc::EFD_CLOEXEC; + let fd = unsafe { libc::eventfd(0, flags) }; + + // Write to the eventfd instance. + let sized_8_data: [u8; 8] = 1_u64.to_ne_bytes(); + let res: i32 = unsafe { + libc::write(fd, sized_8_data.as_ptr() as *const libc::c_void, 8).try_into().unwrap() + }; + assert_eq!(res, 8); + + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Register eventfd with EPOLLIN | EPOLLOUT | EPOLLET + // EPOLLET is negative number for i32 so casting is needed to do proper bitwise OR for u32. + let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fd).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fd, &mut ev) }; + assert_ne!(res, -1); + + // Check result from epoll_wait. + let expected_event = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value = u64::try_from(fd).unwrap(); + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); +} + +fn test_pointer() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create a socketpair instance. + let mut fds = [-1, -1]; + let res = unsafe { libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, fds.as_mut_ptr()) }; + assert_eq!(res, 0); + + // Register fd[1] with EPOLLIN|EPOLLOUT|EPOLLET + // EPOLLET is negative number for i32 so casting is needed to do proper bitwise OR for u32. 
+ let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT | libc::EPOLLRDHUP).unwrap() | epollet; + let data = MaybeUninit::::uninit().as_ptr(); + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: data.expose_provenance() as u64, + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); +} + +// When read/write happened on one side of the socketpair, only the other side will be notified. +fn test_epoll_socketpair_special_case() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create a socketpair instance. + let mut fds = [-1, -1]; + let res = unsafe { libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, fds.as_mut_ptr()) }; + assert_eq!(res, 0); + + // Register both fd to the same epoll instance. + let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { events: u32::try_from(flags).unwrap(), u64: fds[0] as u64 }; + let mut res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[0], &mut ev) }; + assert_ne!(res, -1); + let mut ev = libc::epoll_event { events: u32::try_from(flags).unwrap(), u64: fds[1] as u64 }; + res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); + + // Write to fds[1]. + let data = "abcde".as_bytes().as_ptr(); + res = unsafe { libc::write(fds[1], data as *const libc::c_void, 5).try_into().unwrap() }; + assert_eq!(res, 5); + + //Two notification should be received. 
+ let expected_event0 = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value0 = fds[0] as u64; + let expected_event1 = u32::try_from(libc::EPOLLOUT).unwrap(); + let expected_value1 = fds[1] as u64; + assert!(check_epoll_wait::<8>( + epfd, + vec![(expected_event1, expected_value1), (expected_event0, expected_value0)] + )); + + // Read from fds[0]. + let mut buf: [u8; 5] = [0; 5]; + res = unsafe { + libc::read(fds[0], buf.as_mut_ptr().cast(), buf.len() as libc::size_t).try_into().unwrap() + }; + assert_eq!(res, 5); + assert_eq!(buf, "abcde".as_bytes()); + + // Notification should be provided for fds[1]. + let expected_event = u32::try_from(libc::EPOLLOUT).unwrap(); + let expected_value = fds[1] as u64; + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); +} + +// When file description is fully closed, epoll_wait should not provide any notification for +// that file description. +fn test_closed_fd() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create an eventfd instance. + let flags = libc::EFD_NONBLOCK | libc::EFD_CLOEXEC; + let fd = unsafe { libc::eventfd(0, flags) }; + + // Register eventfd with EPOLLIN | EPOLLOUT | EPOLLET + // EPOLLET is negative number for i32 so casting is needed to do proper bitwise OR for u32. + let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fd).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fd, &mut ev) }; + assert_ne!(res, -1); + + // Write to the eventfd instance. + let sized_8_data: [u8; 8] = 1_u64.to_ne_bytes(); + let res: i32 = unsafe { + libc::write(fd, sized_8_data.as_ptr() as *const libc::c_void, 8).try_into().unwrap() + }; + assert_eq!(res, 8); + + // Close the eventfd. 
+ let res = unsafe { libc::close(fd) }; + assert_eq!(res, 0); + + // No notification should be provided because the file description is closed. + assert!(check_epoll_wait::<8>(epfd, vec![])); +} + +// When a certain file descriptor registered with epoll is closed, but the underlying file description +// is not closed, notification should still be provided. +// +// This is a quirk of epoll being described in https://man7.org/linux/man-pages/man7/epoll.7.html +// A file descriptor is removed from an interest list only after all the file descriptors +// referring to the underlying open file description have been closed. +fn test_not_fully_closed_fd() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create an eventfd instance. + let flags = libc::EFD_NONBLOCK | libc::EFD_CLOEXEC; + let fd = unsafe { libc::eventfd(0, flags) }; + + // Dup the fd. + let newfd = unsafe { libc::dup(fd) }; + assert_ne!(newfd, -1); + + // Register eventfd with EPOLLIN | EPOLLOUT | EPOLLET + // EPOLLET is negative number for i32 so casting is needed to do proper bitwise OR for u32. + let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fd).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fd, &mut ev) }; + assert_ne!(res, -1); + + // Close the original fd that being used to register with epoll. + let res = unsafe { libc::close(fd) }; + assert_eq!(res, 0); + + // Notification should still be provided because the file description is not closed. + let expected_event = u32::try_from(libc::EPOLLOUT).unwrap(); + let expected_value = fd as u64; + assert!(check_epoll_wait::<1>(epfd, vec![(expected_event, expected_value)])); + + // Write to the eventfd instance to produce notification. 
+ let sized_8_data: [u8; 8] = 1_u64.to_ne_bytes(); + let res: i32 = unsafe { + libc::write(newfd, sized_8_data.as_ptr() as *const libc::c_void, 8).try_into().unwrap() + }; + assert_eq!(res, 8); + + // Close the dupped fd. + let res = unsafe { libc::close(newfd) }; + assert_eq!(res, 0); + + // No notification should be provided. + assert!(check_epoll_wait::<1>(epfd, vec![])); +} + +// Each time a notification is provided, it should reflect the file description's readiness +// at the moment the latest event occurred. +fn test_event_overwrite() { + // Create an eventfd instance. + let flags = libc::EFD_NONBLOCK | libc::EFD_CLOEXEC; + let fd = unsafe { libc::eventfd(0, flags) }; + + // Write to the eventfd instance. + let sized_8_data: [u8; 8] = 1_u64.to_ne_bytes(); + let res: i32 = unsafe { + libc::write(fd, sized_8_data.as_ptr() as *const libc::c_void, 8).try_into().unwrap() + }; + assert_eq!(res, 8); + + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Register eventfd with EPOLLIN | EPOLLOUT | EPOLLET + // EPOLLET is negative number for i32 so casting is needed to do proper bitwise OR for u32. + let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { + events: u32::try_from(flags).unwrap(), + u64: u64::try_from(fd).unwrap(), + }; + let res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fd, &mut ev) }; + assert_ne!(res, -1); + + // Read from the eventfd instance. + let mut buf: [u8; 8] = [0; 8]; + let res: i32 = unsafe { libc::read(fd, buf.as_mut_ptr().cast(), 8).try_into().unwrap() }; + assert_eq!(res, 8); + + // Check result from epoll_wait. 
+ let expected_event = u32::try_from(libc::EPOLLOUT).unwrap(); + let expected_value = u64::try_from(fd).unwrap(); + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); +} + +// An epoll notification will be provided for every successful read in a socketpair. +// This behaviour differs from the real system. +fn test_socketpair_read() { + // Create an epoll instance. + let epfd = unsafe { libc::epoll_create1(0) }; + assert_ne!(epfd, -1); + + // Create a socketpair instance. + let mut fds = [-1, -1]; + let res = unsafe { libc::socketpair(libc::AF_UNIX, libc::SOCK_STREAM, 0, fds.as_mut_ptr()) }; + assert_eq!(res, 0); + + // Register both fd to the same epoll instance. + let epollet = libc::EPOLLET as u32; + let flags = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap() | epollet; + let mut ev = libc::epoll_event { events: u32::try_from(flags).unwrap(), u64: fds[0] as u64 }; + let mut res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[0], &mut ev) }; + assert_ne!(res, -1); + let mut ev = libc::epoll_event { events: u32::try_from(flags).unwrap(), u64: fds[1] as u64 }; + res = unsafe { libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, fds[1], &mut ev) }; + assert_ne!(res, -1); + + // Write 5 bytes to fds[1]. + let data = "abcde".as_bytes().as_ptr(); + res = unsafe { libc::write(fds[1], data as *const libc::c_void, 5).try_into().unwrap() }; + assert_eq!(res, 5); + + //Two notification should be received. + let expected_event0 = u32::try_from(libc::EPOLLIN | libc::EPOLLOUT).unwrap(); + let expected_value0 = fds[0] as u64; + let expected_event1 = u32::try_from(libc::EPOLLOUT).unwrap(); + let expected_value1 = fds[1] as u64; + assert!(check_epoll_wait::<8>( + epfd, + vec![(expected_event1, expected_value1), (expected_event0, expected_value0)] + )); + + // Read 3 bytes from fds[0].
+ let mut buf: [u8; 3] = [0; 3]; + res = unsafe { + libc::read(fds[0], buf.as_mut_ptr().cast(), buf.len() as libc::size_t).try_into().unwrap() + }; + assert_eq!(res, 3); + assert_eq!(buf, "abc".as_bytes()); + + // Notification will be provided. + // But in real system, no notification will be provided here. + let expected_event = u32::try_from(libc::EPOLLOUT).unwrap(); + let expected_value = fds[1] as u64; + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); + + // Read until the buffer is empty. + let mut buf: [u8; 2] = [0; 2]; + res = unsafe { + libc::read(fds[0], buf.as_mut_ptr().cast(), buf.len() as libc::size_t).try_into().unwrap() + }; + assert_eq!(res, 2); + assert_eq!(buf, "de".as_bytes()); + + // Notification will be provided. + // In real system, notification will be provided too. + let expected_event = u32::try_from(libc::EPOLLOUT).unwrap(); + let expected_value = fds[1] as u64; + assert!(check_epoll_wait::<8>(epfd, vec![(expected_event, expected_value)])); +}