diff --git a/doc/oci.md b/doc/oci.md new file mode 100644 index 0000000..dab7f1a --- /dev/null +++ b/doc/oci.md @@ -0,0 +1,53 @@ +# How to create a composefs from an OCI image + +This document is incomplete. It only serves to document some decisions we've +taken about how to resolve ambiguous situations. + +# Data precision + +We currently create a composefs image using the granularity of data as +typically appears in OCI tarballs: + - atime and ctime are not present (these are actually not physically present + in the erofs inode structure at all, either the compact or extended forms) + - mtime is set to the mtime in seconds; the sub-seconds value is simply + truncated (ie: we always round down). erofs has an nsec field, but it's not + normally present in OCI tarballs. That's down to the fact that the usual + tar header only has timestamps in seconds and extended headers are not + usually added for this purpose. + - we take great care to faithfully represent hardlinks: even though the + produced filesystem is read-only and we have data de-duplication via the + objects store, we make sure that hardlinks result in an actual shared inode + as visible via the `st_ino` and `st_nlink` fields on the mounted filesystem. + +We apply these precision restrictions also when creating images by scanning the +filesystem. For example: even if we get more-accurate timestamp information, +we'll truncate it down to whole seconds. + +# Merging directories + +This is done according to the OCI spec, with an additional clarification: in +case a directory entry is present in multiple layers, we use the tar metadata +from the most-derived layer to determine the attributes (owner, permissions, +mtime) for the directory. + +# The root inode + +The root inode (/) is a difficult case because it doesn't always appear in the +layer tarballs. We need to make some arbitrary decisions about the metadata. 
+ +Here's what we do: + + - if any layer tarball contains an entry for '/' then we'd like to use it. + The code for this doesn't exist yet, but it seems reasonable as a principle. + In case the `/` entry were to appear in multiple layers, we'd use the + most-derived layer in which it is present (as per the logic in the previous + section). + - otherwise: + - we assume that the root directory is owned by root:root and has `a+rx` + permissions (ie: `0555`). This matches the behaviour of podman. Note in + particular: podman uses `0555`, not `0755`: the root directory is not + (nominally) writable by the root user. + - the mtime of the root directory is taken to be equal to the most recent + file in the entire system, that is: the highest numerical value of any + mtime on any inode. The rationale is that this is usually a very good + proxy for "when was the (most-derived) container image created". diff --git a/examples/unified/.gitignore b/examples/unified/.gitignore new file mode 100644 index 0000000..acef31e --- /dev/null +++ b/examples/unified/.gitignore @@ -0,0 +1,5 @@ +/cfsctl +/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot +/fix-verity.efi +/image.qcow2 +/tmp/ diff --git a/examples/unified/Containerfile b/examples/unified/Containerfile new file mode 100644 index 0000000..3073ed5 --- /dev/null +++ b/examples/unified/Containerfile @@ -0,0 +1,48 @@ +# Need 6.12 kernel from rawhide +FROM fedora:rawhide AS base +COPY extra / +COPY cfsctl /usr/bin +RUN --mount=type=cache,target=/var/cache/libdnf5 < /etc/kernel/cmdline +EOF +RUN --mount=type=cache,target=/var/cache/libdnf5 < tmp/efi/loader/loader.conf +mkdir -p tmp/efi/EFI/BOOT tmp/efi/EFI/systemd +cp /usr/lib/systemd/boot/efi/systemd-bootx64.efi tmp/efi/EFI/systemd +cp /usr/lib/systemd/boot/efi/systemd-bootx64.efi tmp/efi/EFI/BOOT/BOOTX64.EFI +${CFSCTL} oci prepare-boot "${IMAGE_ID}" tmp/efi + +fakeroot ./make-image +qemu-img convert -f raw tmp/image.raw -O qcow2 image.qcow2 +./fix-verity 
image.qcow2 # https://github.com/tytso/e2fsprogs/issues/201 diff --git a/examples/unified/empty b/examples/unified/empty new file mode 100644 index 0000000..e69de29 diff --git a/examples/unified/extra/etc/resolv.conf b/examples/unified/extra/etc/resolv.conf new file mode 120000 index 0000000..697ba64 --- /dev/null +++ b/examples/unified/extra/etc/resolv.conf @@ -0,0 +1 @@ +../run/systemd/resolve/stub-resolv.conf \ No newline at end of file diff --git a/examples/unified/extra/usr/lib/dracut/dracut.conf.d/37composefs.conf b/examples/unified/extra/usr/lib/dracut/dracut.conf.d/37composefs.conf new file mode 100644 index 0000000..1defe5d --- /dev/null +++ b/examples/unified/extra/usr/lib/dracut/dracut.conf.d/37composefs.conf @@ -0,0 +1,6 @@ +# we want to make sure the virtio disk drivers get included +hostonly=no + +# we need to force these in via the initramfs because we don't have modules in +# the base image +force_drivers+=" virtio_net vfat " diff --git a/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot.service b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot.service new file mode 100644 index 0000000..3ba0562 --- /dev/null +++ b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot.service @@ -0,0 +1,34 @@ +# Copyright (C) 2013 Colin Walters +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library. 
If not, see . + +[Unit] +DefaultDependencies=no +ConditionKernelCommandLine=composefs +ConditionPathExists=/etc/initrd-release +After=sysroot.mount +Requires=sysroot.mount +Before=initrd-root-fs.target +Before=initrd-switch-root.target + +OnFailure=emergency.target +OnFailureJobMode=isolate + +[Service] +Type=oneshot +ExecStart=/usr/bin/composefs-pivot-sysroot +StandardInput=null +StandardOutput=journal +StandardError=journal+console +RemainAfterExit=yes diff --git a/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/module-setup.sh b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/module-setup.sh new file mode 100755 index 0000000..c4186c6 --- /dev/null +++ b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/module-setup.sh @@ -0,0 +1,20 @@ +#!/usr/bin/bash + +check() { + return 0 +} + +depends() { + return 0 +} + +install() { + inst \ + "${moddir}/composefs-pivot-sysroot" /bin/composefs-pivot-sysroot + inst \ + "${moddir}/composefs-pivot-sysroot.service" \ + "${systemdsystemunitdir}/composefs-pivot-sysroot.service" + + $SYSTEMCTL -q --root "${initdir}" add-wants \ + 'initrd-root-fs.target' 'composefs-pivot-sysroot.service' +} diff --git a/examples/unified/extra/usr/lib/kernel/install.conf.d/37composefs.conf b/examples/unified/extra/usr/lib/kernel/install.conf.d/37composefs.conf new file mode 100644 index 0000000..4d12c4e --- /dev/null +++ b/examples/unified/extra/usr/lib/kernel/install.conf.d/37composefs.conf @@ -0,0 +1,2 @@ +layout = uki +uki_generator = ukify diff --git a/examples/unified/extra/usr/lib/systemd/network/37-wired.network b/examples/unified/extra/usr/lib/systemd/network/37-wired.network new file mode 100644 index 0000000..e4e05fd --- /dev/null +++ b/examples/unified/extra/usr/lib/systemd/network/37-wired.network @@ -0,0 +1,9 @@ +[Match] +Type=ether + +[Link] +RequiredForOnline=routable + +[Network] +DHCP=yes + diff --git a/examples/unified/extra/usr/lib/systemd/system/systemd-growfs-root.service.d/37-composefs.conf 
b/examples/unified/extra/usr/lib/systemd/system/systemd-growfs-root.service.d/37-composefs.conf new file mode 100644 index 0000000..c387c18 --- /dev/null +++ b/examples/unified/extra/usr/lib/systemd/system/systemd-growfs-root.service.d/37-composefs.conf @@ -0,0 +1,6 @@ +# Make sure we grow the right root filesystem + +[Service] +ExecStart= +ExecStart=/usr/lib/systemd/systemd-growfs /sysroot + diff --git a/examples/unified/fix-verity b/examples/unified/fix-verity new file mode 100755 index 0000000..783a49a --- /dev/null +++ b/examples/unified/fix-verity @@ -0,0 +1,59 @@ +#!/bin/sh + +# workaround for https://github.com/tytso/e2fsprogs/issues/201 + +set -eux + +# We use a custom UKI with an initramfs containing a script that remounts +# /sysroot read-write and enables fs-verity on all of the objects in +# /composefs/objects. +# +# The first time we're run (or if we are modified) we (re-)generate the UKI. +# This is done inside of a container (for independence from the host OS). + +image_file="$1" + +if [ "$0" -nt fix-verity.efi ]; then + podman run --rm -i fedora > tmp/fix-verity.efi <<'EOF' + set -eux + + cat > /tmp/fix-verity.sh <<'EOS' + mount -o remount,rw /sysroot + ( + cd /sysroot/composefs/objects + echo >&2 'Enabling fsverity on composefs objects' + for i in */*; do + fsverity enable $i; + done + echo >&2 'done!' 
+ ) + umount /sysroot + sync + poweroff -ff +EOS + + ( + dnf --setopt keepcache=1 install -y \ + kernel binutils systemd-boot-unsigned btrfs-progs fsverity-utils + dracut \ + --uefi \ + --no-hostonly \ + --install 'sync fsverity' \ + --include /tmp/fix-verity.sh /lib/dracut/hooks/pre-pivot/fix-verity.sh \ + --kver "$(rpm -q kernel-core --qf '%{VERSION}-%{RELEASE}.%{ARCH}')" \ + --kernel-cmdline="root=PARTLABEL=root-x86-64 console=ttyS0" \ + /tmp/fix-verity.efi + ) >&2 + + cat /tmp/fix-verity.efi +EOF + mv tmp/fix-verity.efi fix-verity.efi +fi + +qemu-system-x86_64 \ + -nographic \ + -m 4096 \ + -enable-kvm \ + -bios /usr/share/edk2/ovmf/OVMF_CODE.fd \ + -drive file="$1",if=virtio,media=disk \ + -kernel fix-verity.efi diff --git a/examples/unified/make-image b/examples/unified/make-image new file mode 100755 index 0000000..ff05a0f --- /dev/null +++ b/examples/unified/make-image @@ -0,0 +1,19 @@ +#!/bin/sh + +set -eux + +chown -R 0:0 tmp/sysroot +chcon -R system_u:object_r:usr_t:s0 tmp/sysroot/composefs +chcon system_u:object_r:var_t:s0 tmp/sysroot/var + +> tmp/image.raw +SYSTEMD_REPART_MKFS_OPTIONS_EXT4='-O verity' \ + systemd-repart \ + --empty=require \ + --size=auto \ + --dry-run=no \ + --no-pager \ + --offline=yes \ + --root=tmp \ + --definitions=repart.d \ + tmp/image.raw diff --git a/examples/unified/repart.d/01-esp.conf b/examples/unified/repart.d/01-esp.conf new file mode 100644 index 0000000..67f93e1 --- /dev/null +++ b/examples/unified/repart.d/01-esp.conf @@ -0,0 +1,6 @@ +[Partition] +Type=esp +Format=vfat +CopyFiles=/efi:/ +SizeMinBytes=512M +SizeMaxBytes=512M diff --git a/examples/unified/repart.d/02-sysroot.conf b/examples/unified/repart.d/02-sysroot.conf new file mode 100644 index 0000000..65f289e --- /dev/null +++ b/examples/unified/repart.d/02-sysroot.conf @@ -0,0 +1,6 @@ +[Partition] +Type=root +Format=ext4 +SizeMinBytes=10G +SizeMaxBytes=10G +CopyFiles=/sysroot:/ diff --git a/examples/unified/run b/examples/unified/run new file mode 100755 index 
0000000..5742835 --- /dev/null +++ b/examples/unified/run @@ -0,0 +1,12 @@ +#!/bin/sh + +set -eux + +cd "${0%/*}" + +qemu-system-x86_64 \ + -m 4096 \ + -enable-kvm \ + -bios /usr/share/edk2/ovmf/OVMF_CODE.fd \ + -drive file=image.qcow2,if=virtio,cache=unsafe \ + -nic user,model=virtio-net-pci diff --git a/src/bin/cfsctl.rs b/src/bin/cfsctl.rs index 2437b2b..d62a03e 100644 --- a/src/bin/cfsctl.rs +++ b/src/bin/cfsctl.rs @@ -73,7 +73,9 @@ enum Command { /// Perform garbage collection GC, /// Imports a composefs image (unsafe!) - ImportImage { reference: String }, + ImportImage { + reference: String, + }, /// Commands for dealing with OCI layers Oci { #[clap(subcommand)] @@ -86,6 +88,12 @@ enum Command { /// the mountpoint mountpoint: String, }, + CreateImage { + path: PathBuf, + }, + CreateDumpfile { + path: PathBuf, + }, } fn main() -> Result<()> { @@ -165,6 +173,13 @@ fn main() -> Result<()> { oci::prepare_boot(&repo, name, None, &output)?; } }, + Command::CreateImage { ref path } => { + let image_id = composefs::fs::create_image(path, Some(&repo))?; + println!("{}", hex::encode(image_id)); + } + Command::CreateDumpfile { ref path } => { + composefs::fs::create_dumpfile(path)?; + } Command::Mount { name, mountpoint } => { repo.mount(&name, &mountpoint)?; } diff --git a/src/fs.rs b/src/fs.rs index cab30ff..5e7b08c 100644 --- a/src/fs.rs +++ b/src/fs.rs @@ -1,16 +1,31 @@ -use std::{ffi::OsStr, mem::MaybeUninit, path::Path}; +use std::{ + cell::RefCell, + collections::{BTreeMap, HashMap}, + ffi::OsString, + ffi::{CStr, OsStr}, + mem::MaybeUninit, + os::unix::ffi::{OsStrExt, OsStringExt}, + path::Path, + rc::Rc, +}; -use anyhow::Result; +use anyhow::{bail, ensure, Result}; use rustix::{ - fd::OwnedFd, - fs::{fdatasync, linkat, mkdirat, mknodat, openat, symlinkat, AtFlags, FileType, OFlags, CWD}, + fd::{AsFd, OwnedFd}, + fs::{ + fdatasync, fstat, getxattr, linkat, listxattr, mkdirat, mknodat, openat, readlinkat, + symlinkat, AtFlags, Dir, FileType, Mode, OFlags, CWD, + 
}, io::{read_uninit, write, Errno}, }; use crate::{ - image::{DirEnt, Directory, Inode, Leaf, LeafContent, Stat}, + fsverity::{digest::FsVerityHasher, Sha256HashValue}, + image::{DirEnt, Directory, FileSystem, Inode, Leaf, LeafContent, Stat}, repository::Repository, + selabel::selabel, util::proc_self_fd, + INLINE_CONTENT_MAX, }; fn set_file_contents(dirfd: &OwnedFd, name: &OsStr, stat: &Stat, data: &[u8]) -> Result<()> { @@ -97,7 +112,217 @@ fn write_directory_contents(dir: &Directory, fd: &OwnedFd, repo: &Repository) -> Ok(()) } +// NB: hardlinks not supported pub fn write_to_path(repo: &Repository, dir: &Directory, output_dir: &Path) -> Result<()> { let fd = openat(CWD, output_dir, OFlags::PATH | OFlags::DIRECTORY, 0.into())?; write_directory_contents(dir, &fd, repo) } + +pub struct FilesystemReader<'repo> { + st_dev: u64, + repo: Option<&'repo Repository>, + inodes: HashMap>, + root_mtime: i64, +} + +impl<'repo> FilesystemReader<'repo> { + fn read_xattrs(&mut self, fd: &OwnedFd) -> Result, Box<[u8]>>> { + // flistxattr() and fgetxattr() don't work with O_PATH fds, so go via /proc/self/fd. Note: + // we want the symlink-following version of this call, which produces the correct behaviour + // even when trying to read xattrs from symlinks themselves. 
See + // https://gist.github.com/allisonkarlitskaya/7a80f2ebb3314d80f45c653a1ba0e398 + let filename = proc_self_fd(fd); + + let mut xattrs = BTreeMap::new(); + + let names_size = listxattr(&filename, &mut [])?; + let mut names = vec![0; names_size]; + let actual_names_size = listxattr(&filename, &mut names)?; + ensure!( + actual_names_size == names.len(), + "xattrs changed during read" + ); + + let names: Vec = names.into_iter().map(|c| c as u8).collect(); // fml + + let mut buffer = [0; 65536]; + for name in names.split_inclusive(|c| *c == 0) { + let name = CStr::from_bytes_with_nul(name)?; + let value_size = getxattr(&filename, name, &mut buffer)?; + let key = Box::from(OsStr::from_bytes(name.to_bytes())); + let value = Box::from(&buffer[..value_size]); + xattrs.insert(key, value); + } + + Ok(xattrs) + } + + fn stat(&mut self, fd: &OwnedFd, ifmt: FileType) -> Result<(rustix::fs::Stat, Stat)> { + let buf = fstat(fd)?; + + ensure!( + FileType::from_raw_mode(buf.st_mode) == ifmt, + "File type changed + between readdir() and fstat()" + ); + + let mtime = buf.st_mtime as i64; + + if buf.st_dev != self.st_dev { + if self.st_dev == u64::MAX { + self.st_dev = buf.st_dev; + } else { + bail!("Attempting to cross devices while importing filesystem"); + } + } else { + // The root mtime is equal to the most recent mtime of any inode *except* the root + // directory. Because self.st_dev is unset at first, we know we're in this branch only + // if this is the second (or later) inode we process (ie: not the root directory). 
+ if mtime > self.root_mtime { + self.root_mtime = mtime; + } + } + + Ok(( + buf, + Stat { + st_mode: buf.st_mode & 0o7777, + st_uid: buf.st_uid, + st_gid: buf.st_gid, + st_mtim_sec: mtime, + xattrs: RefCell::new(self.read_xattrs(fd)?), + }, + )) + } + + fn read_leaf_content(&mut self, fd: OwnedFd, buf: rustix::fs::Stat) -> Result { + let content = match FileType::from_raw_mode(buf.st_mode) { + FileType::Directory | FileType::Unknown => unreachable!(), + FileType::RegularFile => { + let mut buffer = vec![MaybeUninit::uninit(); buf.st_size as usize]; + let (data, _) = read_uninit(fd, &mut buffer)?; + + if buf.st_size > INLINE_CONTENT_MAX as i64 { + let id = if let Some(repo) = self.repo { + repo.ensure_object(data)? + } else { + FsVerityHasher::hash(data) + }; + LeafContent::ExternalFile(id, buf.st_size as u64) + } else { + LeafContent::InlineFile(Vec::from(data)) + } + } + FileType::Symlink => { + let target = readlinkat(fd, "", [])?; + LeafContent::Symlink(OsString::from_vec(target.into_bytes())) + } + FileType::CharacterDevice => LeafContent::CharacterDevice(buf.st_rdev), + FileType::BlockDevice => LeafContent::BlockDevice(buf.st_rdev), + FileType::Fifo => LeafContent::Fifo, + FileType::Socket => LeafContent::Socket, + }; + Ok(content) + } + + fn read_leaf(&mut self, dirfd: &OwnedFd, name: &OsStr, ifmt: FileType) -> Result> { + let oflags = match ifmt { + FileType::RegularFile => OFlags::RDONLY, + _ => OFlags::PATH, + }; + + let fd = openat( + dirfd, + name, + oflags | OFlags::NOFOLLOW | OFlags::CLOEXEC, + Mode::empty(), + )?; + + let (buf, stat) = self.stat(&fd, ifmt)?; + + if let Some(leafref) = self.inodes.get(&buf.st_ino) { + Ok(Rc::clone(leafref)) + } else { + let content = self.read_leaf_content(fd, buf)?; + let leaf = Rc::new(Leaf { stat, content }); + if buf.st_nlink > 1 { + self.inodes.insert(buf.st_ino, Rc::clone(&leaf)); + } + Ok(leaf) + } + } + + pub fn read_directory(&mut self, dirfd: impl AsFd, name: &OsStr) -> Result { + let fd = openat( + dirfd, + 
name, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::NOFOLLOW | OFlags::CLOEXEC, + Mode::empty(), + )?; + + let (_, stat) = self.stat(&fd, FileType::Directory)?; + let mut directory = Directory { + stat, + entries: vec![], + }; + + for item in Dir::read_from(&fd)? { + let entry = item?; + let name = OsStr::from_bytes(entry.file_name().to_bytes()); + + if name == "." || name == ".." { + continue; + } + + let inode = self.read_inode(&fd, name, entry.file_type())?; + directory.insert(name, inode); + } + + Ok(directory) + } + + fn read_inode(&mut self, dirfd: &OwnedFd, name: &OsStr, ifmt: FileType) -> Result { + if ifmt == FileType::Directory { + Ok(Inode::Directory(Box::new( + self.read_directory(dirfd, name)?, + ))) + } else { + Ok(Inode::Leaf(self.read_leaf(dirfd, name, ifmt)?)) + } + } +} + +pub fn read_from_path(path: &Path, repo: Option<&Repository>) -> Result { + let mut reader = FilesystemReader { + repo, + inodes: HashMap::new(), + st_dev: u64::MAX, + root_mtime: 0, + }; + let mut fs = FileSystem { + root: reader.read_directory(CWD, path.as_os_str())?, + }; + fs.root.stat.st_mtim_sec = reader.root_mtime; + + // We can only relabel if we have the repo because we need to read the config and policy files + if let Some(repo) = repo { + selabel(&mut fs, repo)?; + } + + Ok(fs) +} + +pub fn create_image(path: &Path, repo: Option<&Repository>) -> Result { + let fs = read_from_path(path, repo)?; + let image = super::image::mkcomposefs(fs)?; + if let Some(repo) = repo { + Ok(repo.write_image(None, &image)?) 
+ } else { + Ok(FsVerityHasher::hash(&image)) + } +} + +pub fn create_dumpfile(path: &Path) -> Result<()> { + let fs = read_from_path(path, None)?; + super::dumpfile::write_dumpfile(&mut std::io::stdout(), &fs) +} diff --git a/src/image.rs b/src/image.rs index ae635ad..b03049a 100644 --- a/src/image.rs +++ b/src/image.rs @@ -1,6 +1,7 @@ use std::{ cell::RefCell, cmp::{Ord, Ordering}, + collections::BTreeMap, ffi::{OsStr, OsString}, io::Read, path::Path, @@ -18,7 +19,7 @@ pub struct Stat { pub st_uid: u32, pub st_gid: u32, pub st_mtim_sec: i64, - pub xattrs: RefCell)>>, + pub xattrs: RefCell, Box<[u8]>>>, } #[derive(Debug)] @@ -113,11 +114,11 @@ impl Directory { } } - pub fn insert(&mut self, name: &OsStr, leaf: Rc) { + pub fn insert(&mut self, name: &OsStr, inode: Inode) { match self.find_entry(name) { Ok(idx) => { // found existing item - self.entries[idx].inode = Inode::Leaf(leaf); + self.entries[idx].inode = inode; } Err(idx) => { // need to add new item @@ -125,7 +126,7 @@ impl Directory { idx, DirEnt { name: OsString::from(name), - inode: Inode::Leaf(leaf), + inode, }, ); } @@ -154,6 +155,20 @@ impl Directory { pub fn remove_all(&mut self) { self.entries.clear(); } + + pub fn newest_file(&self) -> i64 { + let mut newest = self.stat.st_mtim_sec; + for DirEnt { inode, .. 
} in &self.entries { + let mtime = match inode { + Inode::Leaf(ref leaf) => leaf.stat.st_mtim_sec, + Inode::Directory(ref dir) => dir.newest_file(), + }; + if mtime > newest { + newest = mtime; + } + } + newest + } } pub struct FileSystem { @@ -171,11 +186,11 @@ impl FileSystem { FileSystem { root: Directory { stat: Stat { - st_mode: 0o755, - st_uid: 0, - st_gid: 0, - st_mtim_sec: 0, - xattrs: RefCell::new(vec![]), + st_mode: u32::MAX, // assigned later + st_uid: u32::MAX, // assigned later + st_gid: u32::MAX, // assigned later + st_mtim_sec: -1, // assigned later + xattrs: RefCell::new(BTreeMap::new()), }, entries: vec![], }, @@ -211,7 +226,7 @@ impl FileSystem { pub fn insert_rc(&mut self, name: &Path, leaf: Rc) -> Result<()> { if let Some(filename) = name.file_name() { let dir = self.get_parent_dir(name)?; - dir.insert(filename, leaf); + dir.insert(filename, Inode::Leaf(leaf)); Ok(()) } else { todo!() @@ -245,6 +260,25 @@ impl FileSystem { todo!(); } } + + pub fn done(&mut self) { + // We need to look at the root entry and deal with the "assign later" fields + let stat = &mut self.root.stat; + + if stat.st_mode == u32::MAX { + stat.st_mode = 0o555; + } + if stat.st_uid == u32::MAX { + stat.st_uid = 0; + } + if stat.st_gid == u32::MAX { + stat.st_gid = 0; + } + if stat.st_mtim_sec == -1 { + // write this in full to avoid annoying the borrow checker + self.root.stat.st_mtim_sec = self.root.newest_file(); + } + } } pub fn mkcomposefs(filesystem: FileSystem) -> Result> { diff --git a/src/lib.rs b/src/lib.rs index dd9b06a..dccc9b6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,3 +9,8 @@ pub mod repository; pub mod selabel; pub mod splitstream; pub mod util; + +/// All files that contain 64 or fewer bytes (size <= INLINE_CONTENT_MAX) should be stored inline +/// in the erofs image (and also in splitstreams). All files with 65 or more bytes (size > MAX) +/// should be written to the object storage and referred to from the image (and splitstreams). 
+pub const INLINE_CONTENT_MAX: usize = 64; diff --git a/src/oci/image.rs b/src/oci/image.rs index 772c28e..ffec4e9 100644 --- a/src/oci/image.rs +++ b/src/oci/image.rs @@ -6,7 +6,7 @@ use oci_spec::image::ImageConfiguration; use crate::{ dumpfile::write_dumpfile, fsverity::Sha256HashValue, - image::{mkcomposefs, FileSystem, Leaf}, + image::{mkcomposefs, FileSystem, Inode, Leaf}, oci, repository::Repository, selabel::selabel, @@ -39,10 +39,10 @@ pub fn process_entry(filesystem: &mut FileSystem, entry: oci::tar::TarEntry) -> oci::tar::TarItem::Directory => dir.mkdir(filename, entry.stat), oci::tar::TarItem::Leaf(content) => dir.insert( filename, - Rc::new(Leaf { + Inode::Leaf(Rc::new(Leaf { stat: entry.stat, content, - }), + })), ), oci::tar::TarItem::Hardlink(ref target) => { // TODO: would be nice to do this inline, but borrow checker doesn't like it @@ -65,6 +65,7 @@ pub fn compose_filesystem(repo: &Repository, layers: &[String]) -> Result oci::tar::TarEntry { @@ -117,7 +119,7 @@ fn file_entry(path: &str) -> oci::tar::TarEntry { st_uid: 0, st_gid: 0, st_mtim_sec: 0, - xattrs: RefCell::new(vec![]), + xattrs: RefCell::new(BTreeMap::new()), }, item: oci::tar::TarItem::Leaf(LeafContent::InlineFile(vec![])), } @@ -132,7 +134,7 @@ fn dir_entry(path: &str) -> oci::tar::TarEntry { st_uid: 0, st_gid: 0, st_mtim_sec: 0, - xattrs: RefCell::new(vec![]), + xattrs: RefCell::new(BTreeMap::new()), }, item: oci::tar::TarItem::Directory, } diff --git a/src/oci/tar.rs b/src/oci/tar.rs index 74806ab..e33af5c 100644 --- a/src/oci/tar.rs +++ b/src/oci/tar.rs @@ -1,5 +1,6 @@ use std::{ cell::RefCell, + collections::BTreeMap, ffi::{OsStr, OsString}, fmt, io::Read, @@ -7,7 +8,7 @@ use std::{ path::PathBuf, }; -use anyhow::{bail, Result}; +use anyhow::{bail, ensure, Result}; use rustix::fs::makedev; use tar::{EntryType, Header, PaxExtensions}; use tokio::io::{AsyncRead, AsyncReadExt}; @@ -17,6 +18,7 @@ use crate::{ image::{LeafContent, Stat}, splitstream::{SplitStreamData, 
SplitStreamReader, SplitStreamWriter}, util::{read_exactish, read_exactish_async}, + INLINE_CONTENT_MAX, }; fn read_header(reader: &mut R) -> Result> { @@ -55,7 +57,7 @@ pub fn split(tar_stream: &mut R, writer: &mut SplitStreamWriter) -> Res let mut buffer = vec![0u8; storage_size]; tar_stream.read_exact(&mut buffer)?; - if header.entry_type() == EntryType::Regular && storage_size > 0 { + if header.entry_type() == EntryType::Regular && actual_size > INLINE_CONTENT_MAX { // non-empty regular file: store the data in the object store let padding = buffer.split_off(actual_size); writer.write_external(&buffer, padding)?; @@ -85,7 +87,7 @@ pub async fn split_async( let mut buffer = vec![0u8; storage_size]; tar_stream.read_exact(&mut buffer).await?; - if header.entry_type() == EntryType::Regular && storage_size > 0 { + if header.entry_type() == EntryType::Regular && actual_size > INLINE_CONTENT_MAX { // non-empty regular file: store the data in the object store let padding = buffer.split_off(actual_size); writer.write_external(&buffer, padding)?; @@ -123,7 +125,7 @@ impl fmt::Display for TarEntry { } } -fn path_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> PathBuf { +fn path_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> PathBuf { // Prepend leading / let mut path = vec![b'/']; if let Some(name) = pax { @@ -144,9 +146,9 @@ fn path_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> PathBuf { PathBuf::from(OsString::from_vec(path)) } -fn symlink_target_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> OsString { - if let Some(name) = pax { - OsString::from_vec(name) +fn symlink_target_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> OsString { + if let Some(ref name) = pax { + OsString::from(OsStr::from_bytes(name)) } else if !gnu.is_empty() { OsString::from_vec(gnu) } else { @@ -157,9 +159,9 @@ fn symlink_target_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> pub fn get_entry(reader: &mut SplitStreamReader) -> Result> { let mut gnu_longlink: Vec = vec![]; let mut 
gnu_longname: Vec = vec![]; - let mut pax_longlink: Option> = None; - let mut pax_longname: Option> = None; - let mut xattrs = vec![]; + let mut pax_longlink: Option> = None; + let mut pax_longname: Option> = None; + let mut xattrs = BTreeMap::new(); loop { let mut buf = [0u8; 512]; @@ -175,6 +177,10 @@ pub fn get_entry(reader: &mut SplitStreamReader) -> Result match header.entry_type() { EntryType::Regular | EntryType::Continuous => { + ensure!( + size as usize > INLINE_CONTENT_MAX, + "Splitstream incorrectly stored a small ({size} byte) file external" + ); TarItem::Leaf(LeafContent::ExternalFile(id, size)) } _ => bail!( @@ -199,20 +205,25 @@ pub fn get_entry(reader: &mut SplitStreamReader) -> Result TarItem::Directory, EntryType::Regular | EntryType::Continuous => { + ensure!( + content.len() <= INLINE_CONTENT_MAX, + "Splitstream incorrectly stored a large ({} byte) file inline", + content.len() + ); TarItem::Leaf(LeafContent::InlineFile(content)) } EntryType::Link => TarItem::Hardlink({ diff --git a/src/selabel.rs b/src/selabel.rs index f3ae50b..35978e9 100644 --- a/src/selabel.rs +++ b/src/selabel.rs @@ -199,11 +199,13 @@ impl Policy { } fn relabel(stat: &Stat, path: &Path, ifmt: u8, policy: &mut Policy) { + let security_selinux = OsStr::new("security.selinux"); // no literal syntax for this yet + let mut xattrs = stat.xattrs.borrow_mut(); + if let Some(label) = policy.lookup(path.as_os_str(), ifmt) { - stat.xattrs.borrow_mut().push(( - OsString::from("security.selinux"), - Vec::from(label.as_bytes()), - )) + xattrs.insert(Box::from(security_selinux), Box::from(label.as_bytes())); + } else { + xattrs.remove(security_selinux); } }