From b480adceb84902ccaf903f4f74b3101ea6ae2629 Mon Sep 17 00:00:00 2001 From: Allison Karlitskaya Date: Fri, 15 Nov 2024 11:05:48 +0100 Subject: [PATCH 1/5] src: add INLINE_CONTENT_MAX constant We've been doing this incorrectly by storing all non-empty files externally. Add a constant and use it internally. This means that currently-existing splitstreams need to be regenerated: they'll have also stored small files as external references. Add some extra checks at the splitstream-to-image stage that verify that the splitstream has followed the rules correctly: this will help identify older streams that were built with incorrect rules. This is another "delete your repository and start over" change. We *could* provide bridging code here: in case of too-small external files in the splitstream, we could read them from the repository and convert them to inline, but let's save ourselves the bother. Closes #26 Signed-off-by: Allison Karlitskaya --- src/lib.rs | 5 +++++ src/oci/tar.rs | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index dd9b06a..dccc9b6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,3 +9,8 @@ pub mod repository; pub mod selabel; pub mod splitstream; pub mod util; + +/// All files that contain 64 or fewer bytes (size <= INLINE_CONTENT_MAX) should be stored inline +/// in the erofs image (and also in splitstreams). All files with 65 or more bytes (size > MAX) +/// should be written to the object storage and referred to from the image (and splitstreams). 
+pub const INLINE_CONTENT_MAX: usize = 64; diff --git a/src/oci/tar.rs b/src/oci/tar.rs index 74806ab..82f8539 100644 --- a/src/oci/tar.rs +++ b/src/oci/tar.rs @@ -7,7 +7,7 @@ use std::{ path::PathBuf, }; -use anyhow::{bail, Result}; +use anyhow::{bail, ensure, Result}; use rustix::fs::makedev; use tar::{EntryType, Header, PaxExtensions}; use tokio::io::{AsyncRead, AsyncReadExt}; @@ -17,6 +17,7 @@ use crate::{ image::{LeafContent, Stat}, splitstream::{SplitStreamData, SplitStreamReader, SplitStreamWriter}, util::{read_exactish, read_exactish_async}, + INLINE_CONTENT_MAX, }; fn read_header(reader: &mut R) -> Result> { @@ -55,7 +56,7 @@ pub fn split(tar_stream: &mut R, writer: &mut SplitStreamWriter) -> Res let mut buffer = vec![0u8; storage_size]; tar_stream.read_exact(&mut buffer)?; - if header.entry_type() == EntryType::Regular && storage_size > 0 { + if header.entry_type() == EntryType::Regular && actual_size > INLINE_CONTENT_MAX { // non-empty regular file: store the data in the object store let padding = buffer.split_off(actual_size); writer.write_external(&buffer, padding)?; @@ -85,7 +86,7 @@ pub async fn split_async( let mut buffer = vec![0u8; storage_size]; tar_stream.read_exact(&mut buffer).await?; - if header.entry_type() == EntryType::Regular && storage_size > 0 { + if header.entry_type() == EntryType::Regular && actual_size > INLINE_CONTENT_MAX { // non-empty regular file: store the data in the object store let padding = buffer.split_off(actual_size); writer.write_external(&buffer, padding)?; @@ -175,6 +176,10 @@ pub fn get_entry(reader: &mut SplitStreamReader) -> Result match header.entry_type() { EntryType::Regular | EntryType::Continuous => { + ensure!( + size as usize > INLINE_CONTENT_MAX, + "Splitstream incorrectly stored a small ({size} byte) file external" + ); TarItem::Leaf(LeafContent::ExternalFile(id, size)) } _ => bail!( @@ -213,6 +218,11 @@ pub fn get_entry(reader: &mut SplitStreamReader) -> Result TarItem::Directory, EntryType::Regular | 
EntryType::Continuous => { + ensure!( + content.len() <= INLINE_CONTENT_MAX, + "Splitstream incorrectly stored a large ({} byte) file inline", + content.len() + ); TarItem::Leaf(LeafContent::InlineFile(content)) } EntryType::Link => TarItem::Hardlink({ From 389359472d76627db6e5bd5e5d92f65f8fec6d22 Mon Sep 17 00:00:00 2001 From: Allison Karlitskaya Date: Fri, 15 Nov 2024 11:10:35 +0100 Subject: [PATCH 2/5] image: support inserting any inode The `.insert()` operation for adding directory entries to the in-memory filesystem structure previously only supported adding "leaf" inodes, with the `.mkdir()` operation to be used for directories. Made `.insert()` take any Inode instead of just Leaf. This will be helpful for our incoming filesystem-scanning code. Signed-off-by: Allison Karlitskaya --- src/image.rs | 8 ++++---- src/oci/image.rs | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/image.rs b/src/image.rs index ae635ad..a9ce3dc 100644 --- a/src/image.rs +++ b/src/image.rs @@ -113,11 +113,11 @@ impl Directory { } } - pub fn insert(&mut self, name: &OsStr, leaf: Rc) { + pub fn insert(&mut self, name: &OsStr, inode: Inode) { match self.find_entry(name) { Ok(idx) => { // found existing item - self.entries[idx].inode = Inode::Leaf(leaf); + self.entries[idx].inode = inode; } Err(idx) => { // need to add new item @@ -125,7 +125,7 @@ impl Directory { idx, DirEnt { name: OsString::from(name), - inode: Inode::Leaf(leaf), + inode, }, ); } @@ -211,7 +211,7 @@ impl FileSystem { pub fn insert_rc(&mut self, name: &Path, leaf: Rc) -> Result<()> { if let Some(filename) = name.file_name() { let dir = self.get_parent_dir(name)?; - dir.insert(filename, leaf); + dir.insert(filename, Inode::Leaf(leaf)); Ok(()) } else { todo!() diff --git a/src/oci/image.rs b/src/oci/image.rs index 772c28e..e48a0b8 100644 --- a/src/oci/image.rs +++ b/src/oci/image.rs @@ -6,7 +6,7 @@ use oci_spec::image::ImageConfiguration; use crate::{ dumpfile::write_dumpfile, 
fsverity::Sha256HashValue, - image::{mkcomposefs, FileSystem, Leaf}, + image::{mkcomposefs, FileSystem, Inode, Leaf}, oci, repository::Repository, selabel::selabel, @@ -39,10 +39,10 @@ pub fn process_entry(filesystem: &mut FileSystem, entry: oci::tar::TarEntry) -> oci::tar::TarItem::Directory => dir.mkdir(filename, entry.stat), oci::tar::TarItem::Leaf(content) => dir.insert( filename, - Rc::new(Leaf { + Inode::Leaf(Rc::new(Leaf { stat: entry.stat, content, - }), + })), ), oci::tar::TarItem::Hardlink(ref target) => { // TODO: would be nice to do this inline, but borrow checker doesn't like it From b556397f86117bc2cae4683000fde974aa8b0c5b Mon Sep 17 00:00:00 2001 From: Allison Karlitskaya Date: Fri, 15 Nov 2024 16:15:30 +0100 Subject: [PATCH 3/5] image: store xattrs in a BTreeMap What's wrong with Vec<(OsString, Vec<u8>)>? It's hard to do lookups, or additions that replace existing values. What's wrong with HashMap? It's not sorted. We'd like to sort the output when producing dumpfiles. We also move from OsString to Box<OsStr> and from Vec<u8> to Box<[u8]>. These types better represent the immutability of these values once they're in the map. OsString and Vec<u8> are both based around being mutable values and even might have extra space allocated. Propagate these changes through the PAX handling in the tar code and move it over to using more reasonable types (with similar rationale). Tweak our SELinux relabel code to work in the presence of a filesystem tree where labels may already be partially present: avoid adding duplicate xattrs and remove labels in case none should be set. 
Signed-off-by: Allison Karlitskaya --- src/image.rs | 5 +++-- src/oci/image.rs | 6 +++--- src/oci/tar.rs | 19 ++++++++++--------- src/selabel.rs | 10 ++++++---- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/image.rs b/src/image.rs index a9ce3dc..b75ebfe 100644 --- a/src/image.rs +++ b/src/image.rs @@ -1,6 +1,7 @@ use std::{ cell::RefCell, cmp::{Ord, Ordering}, + collections::BTreeMap, ffi::{OsStr, OsString}, io::Read, path::Path, @@ -18,7 +19,7 @@ pub struct Stat { pub st_uid: u32, pub st_gid: u32, pub st_mtim_sec: i64, - pub xattrs: RefCell)>>, + pub xattrs: RefCell, Box<[u8]>>>, } #[derive(Debug)] @@ -175,7 +176,7 @@ impl FileSystem { st_uid: 0, st_gid: 0, st_mtim_sec: 0, - xattrs: RefCell::new(vec![]), + xattrs: RefCell::new(BTreeMap::new()), }, entries: vec![], }, diff --git a/src/oci/image.rs b/src/oci/image.rs index e48a0b8..098c1ad 100644 --- a/src/oci/image.rs +++ b/src/oci/image.rs @@ -106,7 +106,7 @@ pub fn create_image( #[cfg(test)] use crate::image::{LeafContent, Stat}; #[cfg(test)] -use std::{cell::RefCell, io::BufRead, path::PathBuf}; +use std::{cell::RefCell, collections::BTreeMap, io::BufRead, path::PathBuf}; #[cfg(test)] fn file_entry(path: &str) -> oci::tar::TarEntry { @@ -117,7 +117,7 @@ fn file_entry(path: &str) -> oci::tar::TarEntry { st_uid: 0, st_gid: 0, st_mtim_sec: 0, - xattrs: RefCell::new(vec![]), + xattrs: RefCell::new(BTreeMap::new()), }, item: oci::tar::TarItem::Leaf(LeafContent::InlineFile(vec![])), } @@ -132,7 +132,7 @@ fn dir_entry(path: &str) -> oci::tar::TarEntry { st_uid: 0, st_gid: 0, st_mtim_sec: 0, - xattrs: RefCell::new(vec![]), + xattrs: RefCell::new(BTreeMap::new()), }, item: oci::tar::TarItem::Directory, } diff --git a/src/oci/tar.rs b/src/oci/tar.rs index 82f8539..e33af5c 100644 --- a/src/oci/tar.rs +++ b/src/oci/tar.rs @@ -1,5 +1,6 @@ use std::{ cell::RefCell, + collections::BTreeMap, ffi::{OsStr, OsString}, fmt, io::Read, @@ -124,7 +125,7 @@ impl fmt::Display for TarEntry { } } -fn 
path_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> PathBuf { +fn path_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> PathBuf { // Prepend leading / let mut path = vec![b'/']; if let Some(name) = pax { @@ -145,9 +146,9 @@ fn path_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> PathBuf { PathBuf::from(OsString::from_vec(path)) } -fn symlink_target_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> OsString { - if let Some(name) = pax { - OsString::from_vec(name) +fn symlink_target_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> OsString { + if let Some(ref name) = pax { + OsString::from(OsStr::from_bytes(name)) } else if !gnu.is_empty() { OsString::from_vec(gnu) } else { @@ -158,9 +159,9 @@ fn symlink_target_from_tar(pax: Option>, gnu: Vec, short: &[u8]) -> pub fn get_entry(reader: &mut SplitStreamReader) -> Result> { let mut gnu_longlink: Vec = vec![]; let mut gnu_longname: Vec = vec![]; - let mut pax_longlink: Option> = None; - let mut pax_longname: Option> = None; - let mut xattrs = vec![]; + let mut pax_longlink: Option> = None; + let mut pax_longname: Option> = None; + let mut xattrs = BTreeMap::new(); loop { let mut buf = [0u8; 512]; @@ -204,14 +205,14 @@ pub fn get_entry(reader: &mut SplitStreamReader) -> Result Date: Fri, 15 Nov 2024 11:13:09 +0100 Subject: [PATCH 4/5] oci: Firm up some questions about the / entry The / entry doesn't appear in many layer tarballs. Until now, we've arbitrarily created it root:root, 0755, with mtime set to the epoch. Let's start thinking about this a bit more rigorously. Add a doc/oci.md with some of these decisions spelled out more explicitly. The upshot: we now use 0555 instead of 0755 and we set the mtime to the mtime of the newest file in the filesystem (instead of the epoch). 
Signed-off-by: Allison Karlitskaya --- doc/oci.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ src/image.rs | 41 +++++++++++++++++++++++++++++++++---- src/oci/image.rs | 2 ++ 3 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 doc/oci.md diff --git a/doc/oci.md b/doc/oci.md new file mode 100644 index 0000000..dab7f1a --- /dev/null +++ b/doc/oci.md @@ -0,0 +1,53 @@ +# How to create a composefs from an OCI image + +This document is incomplete. It only serves to document some decisions we've +taken about how to resolve ambiguous situations. + +# Data precision + +We currently create a composefs image using the granularity of data as +typically appears in OCI tarballs: + - atime and ctime are not present (these are actually not physically present + in the erofs inode structure at all, either the compact or extended forms) + - mtime is set to the mtime in seconds; the sub-seconds value is simply + truncated (ie: we always round down). erofs has an nsec field, but it's not + normally present in OCI tarballs. That's down to the fact that the usual + tar header only has timestamps in seconds and extended headers are not + usually added for this purpose. + - we take great care to faithfully represent hardlinks: even though the + produced filesystem is read-only and we have data de-duplication via the + objects store, we make sure that hardlinks result in an actual shared inode + as visible via the `st_ino` and `st_nlink` fields on the mounted filesystem. + +We apply these precision restrictions also when creating images by scanning the +filesystem. For example: even if we get more-accurate timestamp information, +we'll truncate it to the nearest second. + +# Merging directories + +This is done according to the OCI spec, with an additional clarification: in +case a directory entry is present in multiple layers, we use the tar metadata +from the most-derived layer to determine the attributes (owner, permissions, +mtime) for the directory. 
+ +# The root inode + +The root inode (/) is a difficult case because it doesn't always appear in the +layer tarballs. We need to make some arbitrary decisions about the metadata. + +Here's what we do: + + - if any layer tarball contains an entry for '/' then we'd like to use it. + The code for this doesn't exist yet, but it seems reasonable as a principle. + In case the `/` entry were to appear in multiple layers, we'd use the + most-derived layer in which it is present (as per the logic in the previous + section). + - otherwise: + - we assume that the root directory is owned by root:root and has `a+rx` + permissions (ie: `0555`). This matches the behaviour of podman. Note in + particular: podman uses `0555`, not `0755`: the root directory is not + (nominally) writable by the root user. + - the mtime of the root directory is taken to be equal to the most recent + file in the entire system, that is: the highest numerical value of any + mtime on any inode. The rationale is that this is usually a very good + proxy for "when was the (most-derived) container image created". diff --git a/src/image.rs b/src/image.rs index b75ebfe..b03049a 100644 --- a/src/image.rs +++ b/src/image.rs @@ -155,6 +155,20 @@ impl Directory { pub fn remove_all(&mut self) { self.entries.clear(); } + + pub fn newest_file(&self) -> i64 { + let mut newest = self.stat.st_mtim_sec; + for DirEnt { inode, .. 
} in &self.entries { + let mtime = match inode { + Inode::Leaf(ref leaf) => leaf.stat.st_mtim_sec, + Inode::Directory(ref dir) => dir.newest_file(), + }; + if mtime > newest { + newest = mtime; + } + } + newest + } } pub struct FileSystem { @@ -172,10 +186,10 @@ impl FileSystem { FileSystem { root: Directory { stat: Stat { - st_mode: 0o755, - st_uid: 0, - st_gid: 0, - st_mtim_sec: 0, + st_mode: u32::MAX, // assigned later + st_uid: u32::MAX, // assigned later + st_gid: u32::MAX, // assigned later + st_mtim_sec: -1, // assigned later xattrs: RefCell::new(BTreeMap::new()), }, entries: vec![], @@ -246,6 +260,25 @@ impl FileSystem { todo!(); } } + + pub fn done(&mut self) { + // We need to look at the root entry and deal with the "assign later" fields + let stat = &mut self.root.stat; + + if stat.st_mode == u32::MAX { + stat.st_mode = 0o555; + } + if stat.st_uid == u32::MAX { + stat.st_uid = 0; + } + if stat.st_gid == u32::MAX { + stat.st_gid = 0; + } + if stat.st_mtim_sec == -1 { + // write this in full to avoid annoying the borrow checker + self.root.stat.st_mtim_sec = self.root.newest_file(); + } + } } pub fn mkcomposefs(filesystem: FileSystem) -> Result> { diff --git a/src/oci/image.rs b/src/oci/image.rs index 098c1ad..ffec4e9 100644 --- a/src/oci/image.rs +++ b/src/oci/image.rs @@ -65,6 +65,7 @@ pub fn compose_filesystem(repo: &Repository, layers: &[String]) -> Result Date: Fri, 15 Nov 2024 16:54:30 +0100 Subject: [PATCH 5/5] fs: Add code to import from a filesystem src/fs.rs contains code for writing the in-memory filesystem tree to a directory on disk, so let's add the other direction: converting an on-disk directory to an in-memory filesystem tree. This will let us scan container images from inside containers. This is necessary because we can't get access to the OCI layer tarballs during a container build (even from a later stage in a multi-stage build) but we can bindmount the root filesystem. 
See https://github.com/containers/buildah/issues/5837 With our recent changes to how we handle metadata on the root directory we should now be producing the same image on the inside and the outside, which gives us a nice way to produce a UKI with a built-in `composefs=` command-line parameter. Add a new 'unified' example. This does the container build as a single `podman build` command with no special arguments. Closes #34 Signed-off-by: Allison Karlitskaya --- examples/unified/.gitignore | 5 + examples/unified/Containerfile | 48 ++++ examples/unified/build | 35 +++ examples/unified/empty | 0 examples/unified/extra/etc/resolv.conf | 1 + .../lib/dracut/dracut.conf.d/37composefs.conf | 6 + .../composefs-pivot-sysroot.service | 34 +++ .../modules.d/37composefs/module-setup.sh | 20 ++ .../kernel/install.conf.d/37composefs.conf | 2 + .../usr/lib/systemd/network/37-wired.network | 9 + .../37-composefs.conf | 6 + examples/unified/fix-verity | 59 +++++ examples/unified/make-image | 19 ++ examples/unified/repart.d/01-esp.conf | 6 + examples/unified/repart.d/02-sysroot.conf | 6 + examples/unified/run | 12 + src/bin/cfsctl.rs | 17 +- src/fs.rs | 235 +++++++++++++++++- 18 files changed, 514 insertions(+), 6 deletions(-) create mode 100644 examples/unified/.gitignore create mode 100644 examples/unified/Containerfile create mode 100755 examples/unified/build create mode 100644 examples/unified/empty create mode 120000 examples/unified/extra/etc/resolv.conf create mode 100644 examples/unified/extra/usr/lib/dracut/dracut.conf.d/37composefs.conf create mode 100644 examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot.service create mode 100755 examples/unified/extra/usr/lib/dracut/modules.d/37composefs/module-setup.sh create mode 100644 examples/unified/extra/usr/lib/kernel/install.conf.d/37composefs.conf create mode 100644 examples/unified/extra/usr/lib/systemd/network/37-wired.network create mode 100644 
examples/unified/extra/usr/lib/systemd/system/systemd-growfs-root.service.d/37-composefs.conf create mode 100755 examples/unified/fix-verity create mode 100755 examples/unified/make-image create mode 100644 examples/unified/repart.d/01-esp.conf create mode 100644 examples/unified/repart.d/02-sysroot.conf create mode 100755 examples/unified/run diff --git a/examples/unified/.gitignore b/examples/unified/.gitignore new file mode 100644 index 0000000..acef31e --- /dev/null +++ b/examples/unified/.gitignore @@ -0,0 +1,5 @@ +/cfsctl +/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot +/fix-verity.efi +/image.qcow2 +/tmp/ diff --git a/examples/unified/Containerfile b/examples/unified/Containerfile new file mode 100644 index 0000000..3073ed5 --- /dev/null +++ b/examples/unified/Containerfile @@ -0,0 +1,48 @@ +# Need 6.12 kernel from rawhide +FROM fedora:rawhide AS base +COPY extra / +COPY cfsctl /usr/bin +RUN --mount=type=cache,target=/var/cache/libdnf5 < /etc/kernel/cmdline +EOF +RUN --mount=type=cache,target=/var/cache/libdnf5 < tmp/efi/loader/loader.conf +mkdir -p tmp/efi/EFI/BOOT tmp/efi/EFI/systemd +cp /usr/lib/systemd/boot/efi/systemd-bootx64.efi tmp/efi/EFI/systemd +cp /usr/lib/systemd/boot/efi/systemd-bootx64.efi tmp/efi/EFI/BOOT/BOOTX64.EFI +${CFSCTL} oci prepare-boot "${IMAGE_ID}" tmp/efi + +fakeroot ./make-image +qemu-img convert -f raw tmp/image.raw -O qcow2 image.qcow2 +./fix-verity image.qcow2 # https://github.com/tytso/e2fsprogs/issues/201 diff --git a/examples/unified/empty b/examples/unified/empty new file mode 100644 index 0000000..e69de29 diff --git a/examples/unified/extra/etc/resolv.conf b/examples/unified/extra/etc/resolv.conf new file mode 120000 index 0000000..697ba64 --- /dev/null +++ b/examples/unified/extra/etc/resolv.conf @@ -0,0 +1 @@ +../run/systemd/resolve/stub-resolv.conf \ No newline at end of file diff --git a/examples/unified/extra/usr/lib/dracut/dracut.conf.d/37composefs.conf 
b/examples/unified/extra/usr/lib/dracut/dracut.conf.d/37composefs.conf new file mode 100644 index 0000000..1defe5d --- /dev/null +++ b/examples/unified/extra/usr/lib/dracut/dracut.conf.d/37composefs.conf @@ -0,0 +1,6 @@ +# we want to make sure the virtio disk drivers get included +hostonly=no + +# we need to force these in via the initramfs because we don't have modules in +# the base image +force_drivers+=" virtio_net vfat " diff --git a/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot.service b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot.service new file mode 100644 index 0000000..3ba0562 --- /dev/null +++ b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/composefs-pivot-sysroot.service @@ -0,0 +1,34 @@ +# Copyright (C) 2013 Colin Walters +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library. If not, see . 
+ +[Unit] +DefaultDependencies=no +ConditionKernelCommandLine=composefs +ConditionPathExists=/etc/initrd-release +After=sysroot.mount +Requires=sysroot.mount +Before=initrd-root-fs.target +Before=initrd-switch-root.target + +OnFailure=emergency.target +OnFailureJobMode=isolate + +[Service] +Type=oneshot +ExecStart=/usr/bin/composefs-pivot-sysroot +StandardInput=null +StandardOutput=journal +StandardError=journal+console +RemainAfterExit=yes diff --git a/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/module-setup.sh b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/module-setup.sh new file mode 100755 index 0000000..c4186c6 --- /dev/null +++ b/examples/unified/extra/usr/lib/dracut/modules.d/37composefs/module-setup.sh @@ -0,0 +1,20 @@ +#!/usr/bin/bash + +check() { + return 0 +} + +depends() { + return 0 +} + +install() { + inst \ + "${moddir}/composefs-pivot-sysroot" /bin/composefs-pivot-sysroot + inst \ + "${moddir}/composefs-pivot-sysroot.service" \ + "${systemdsystemunitdir}/composefs-pivot-sysroot.service" + + $SYSTEMCTL -q --root "${initdir}" add-wants \ + 'initrd-root-fs.target' 'composefs-pivot-sysroot.service' +} diff --git a/examples/unified/extra/usr/lib/kernel/install.conf.d/37composefs.conf b/examples/unified/extra/usr/lib/kernel/install.conf.d/37composefs.conf new file mode 100644 index 0000000..4d12c4e --- /dev/null +++ b/examples/unified/extra/usr/lib/kernel/install.conf.d/37composefs.conf @@ -0,0 +1,2 @@ +layout = uki +uki_generator = ukify diff --git a/examples/unified/extra/usr/lib/systemd/network/37-wired.network b/examples/unified/extra/usr/lib/systemd/network/37-wired.network new file mode 100644 index 0000000..e4e05fd --- /dev/null +++ b/examples/unified/extra/usr/lib/systemd/network/37-wired.network @@ -0,0 +1,9 @@ +[Match] +Type=ether + +[Link] +RequiredForOnline=routable + +[Network] +DHCP=yes + diff --git a/examples/unified/extra/usr/lib/systemd/system/systemd-growfs-root.service.d/37-composefs.conf 
b/examples/unified/extra/usr/lib/systemd/system/systemd-growfs-root.service.d/37-composefs.conf new file mode 100644 index 0000000..c387c18 --- /dev/null +++ b/examples/unified/extra/usr/lib/systemd/system/systemd-growfs-root.service.d/37-composefs.conf @@ -0,0 +1,6 @@ +# Make sure we grow the right root filesystem + +[Service] +ExecStart= +ExecStart=/usr/lib/systemd/systemd-growfs /sysroot + diff --git a/examples/unified/fix-verity b/examples/unified/fix-verity new file mode 100755 index 0000000..783a49a --- /dev/null +++ b/examples/unified/fix-verity @@ -0,0 +1,59 @@ +#!/bin/sh + +# workaround for https://github.com/tytso/e2fsprogs/issues/201 + +set -eux + +# We use a custom UKI with an initramfs containing a script that remounts +# /sysroot read-write and enables fs-verity on all of the objects in +# /composefs/objects. +# +# The first time we're run (or if we are modified) we (re-)generate the UKI. +# This is done inside of a container (for independence from the host OS). + +image_file="$1" + +if [ "$0" -nt fix-verity.efi ]; then + podman run --rm -i fedora > tmp/fix-verity.efi <<'EOF' + set -eux + + cat > /tmp/fix-verity.sh <<'EOS' + mount -o remount,rw /sysroot + ( + cd /sysroot/composefs/objects + echo >&2 'Enabling fsverity on composefs objects' + for i in */*; do + fsverity enable $i; + done + echo >&2 'done!' 
+ ) + umount /sysroot + sync + poweroff -ff +EOS + + ( + dnf --setopt keepcache=1 install -y \ + kernel binutils systemd-boot-unsigned btrfs-progs fsverity-utils + dracut \ + --uefi \ + --no-hostonly \ + --install 'sync fsverity' \ + --include /tmp/fix-verity.sh /lib/dracut/hooks/pre-pivot/fix-verity.sh \ + --kver "$(rpm -q kernel-core --qf '%{VERSION}-%{RELEASE}.%{ARCH}')" \ + --kernel-cmdline="root=PARTLABEL=root-x86-64 console=ttyS0" \ + /tmp/fix-verity.efi + ) >&2 + + cat /tmp/fix-verity.efi +EOF + mv tmp/fix-verity.efi fix-verity.efi +fi + +qemu-system-x86_64 \ + -nographic \ + -m 4096 \ + -enable-kvm \ + -bios /usr/share/edk2/ovmf/OVMF_CODE.fd \ + -drive file="$1",if=virtio,media=disk \ + -kernel fix-verity.efi diff --git a/examples/unified/make-image b/examples/unified/make-image new file mode 100755 index 0000000..ff05a0f --- /dev/null +++ b/examples/unified/make-image @@ -0,0 +1,19 @@ +#!/bin/sh + +set -eux + +chown -R 0:0 tmp/sysroot +chcon -R system_u:object_r:usr_t:s0 tmp/sysroot/composefs +chcon system_u:object_r:var_t:s0 tmp/sysroot/var + +> tmp/image.raw +SYSTEMD_REPART_MKFS_OPTIONS_EXT4='-O verity' \ + systemd-repart \ + --empty=require \ + --size=auto \ + --dry-run=no \ + --no-pager \ + --offline=yes \ + --root=tmp \ + --definitions=repart.d \ + tmp/image.raw diff --git a/examples/unified/repart.d/01-esp.conf b/examples/unified/repart.d/01-esp.conf new file mode 100644 index 0000000..67f93e1 --- /dev/null +++ b/examples/unified/repart.d/01-esp.conf @@ -0,0 +1,6 @@ +[Partition] +Type=esp +Format=vfat +CopyFiles=/efi:/ +SizeMinBytes=512M +SizeMaxBytes=512M diff --git a/examples/unified/repart.d/02-sysroot.conf b/examples/unified/repart.d/02-sysroot.conf new file mode 100644 index 0000000..65f289e --- /dev/null +++ b/examples/unified/repart.d/02-sysroot.conf @@ -0,0 +1,6 @@ +[Partition] +Type=root +Format=ext4 +SizeMinBytes=10G +SizeMaxBytes=10G +CopyFiles=/sysroot:/ diff --git a/examples/unified/run b/examples/unified/run new file mode 100755 index 
0000000..5742835 --- /dev/null +++ b/examples/unified/run @@ -0,0 +1,12 @@ +#!/bin/sh + +set -eux + +cd "${0%/*}" + +qemu-system-x86_64 \ + -m 4096 \ + -enable-kvm \ + -bios /usr/share/edk2/ovmf/OVMF_CODE.fd \ + -drive file=image.qcow2,if=virtio,cache=unsafe \ + -nic user,model=virtio-net-pci diff --git a/src/bin/cfsctl.rs b/src/bin/cfsctl.rs index 2437b2b..d62a03e 100644 --- a/src/bin/cfsctl.rs +++ b/src/bin/cfsctl.rs @@ -73,7 +73,9 @@ enum Command { /// Perform garbage collection GC, /// Imports a composefs image (unsafe!) - ImportImage { reference: String }, + ImportImage { + reference: String, + }, /// Commands for dealing with OCI layers Oci { #[clap(subcommand)] @@ -86,6 +88,12 @@ enum Command { /// the mountpoint mountpoint: String, }, + CreateImage { + path: PathBuf, + }, + CreateDumpfile { + path: PathBuf, + }, } fn main() -> Result<()> { @@ -165,6 +173,13 @@ fn main() -> Result<()> { oci::prepare_boot(&repo, name, None, &output)?; } }, + Command::CreateImage { ref path } => { + let image_id = composefs::fs::create_image(path, Some(&repo))?; + println!("{}", hex::encode(image_id)); + } + Command::CreateDumpfile { ref path } => { + composefs::fs::create_dumpfile(path)?; + } Command::Mount { name, mountpoint } => { repo.mount(&name, &mountpoint)?; } diff --git a/src/fs.rs b/src/fs.rs index cab30ff..5e7b08c 100644 --- a/src/fs.rs +++ b/src/fs.rs @@ -1,16 +1,31 @@ -use std::{ffi::OsStr, mem::MaybeUninit, path::Path}; +use std::{ + cell::RefCell, + collections::{BTreeMap, HashMap}, + ffi::OsString, + ffi::{CStr, OsStr}, + mem::MaybeUninit, + os::unix::ffi::{OsStrExt, OsStringExt}, + path::Path, + rc::Rc, +}; -use anyhow::Result; +use anyhow::{bail, ensure, Result}; use rustix::{ - fd::OwnedFd, - fs::{fdatasync, linkat, mkdirat, mknodat, openat, symlinkat, AtFlags, FileType, OFlags, CWD}, + fd::{AsFd, OwnedFd}, + fs::{ + fdatasync, fstat, getxattr, linkat, listxattr, mkdirat, mknodat, openat, readlinkat, + symlinkat, AtFlags, Dir, FileType, Mode, OFlags, CWD, + 
}, io::{read_uninit, write, Errno}, }; use crate::{ - image::{DirEnt, Directory, Inode, Leaf, LeafContent, Stat}, + fsverity::{digest::FsVerityHasher, Sha256HashValue}, + image::{DirEnt, Directory, FileSystem, Inode, Leaf, LeafContent, Stat}, repository::Repository, + selabel::selabel, util::proc_self_fd, + INLINE_CONTENT_MAX, }; fn set_file_contents(dirfd: &OwnedFd, name: &OsStr, stat: &Stat, data: &[u8]) -> Result<()> { @@ -97,7 +112,217 @@ fn write_directory_contents(dir: &Directory, fd: &OwnedFd, repo: &Repository) -> Ok(()) } +// NB: hardlinks not supported pub fn write_to_path(repo: &Repository, dir: &Directory, output_dir: &Path) -> Result<()> { let fd = openat(CWD, output_dir, OFlags::PATH | OFlags::DIRECTORY, 0.into())?; write_directory_contents(dir, &fd, repo) } + +pub struct FilesystemReader<'repo> { + st_dev: u64, + repo: Option<&'repo Repository>, + inodes: HashMap>, + root_mtime: i64, +} + +impl<'repo> FilesystemReader<'repo> { + fn read_xattrs(&mut self, fd: &OwnedFd) -> Result, Box<[u8]>>> { + // flistxattr() and fgetxattr() don't work with O_PATH fds, so go via /proc/self/fd. Note: + // we want the symlink-following version of this call, which produces the correct behaviour + // even when trying to read xattrs from symlinks themselves. 
See + // https://gist.github.com/allisonkarlitskaya/7a80f2ebb3314d80f45c653a1ba0e398 + let filename = proc_self_fd(fd); + + let mut xattrs = BTreeMap::new(); + + let names_size = listxattr(&filename, &mut [])?; + let mut names = vec![0; names_size]; + let actual_names_size = listxattr(&filename, &mut names)?; + ensure!( + actual_names_size == names.len(), + "xattrs changed during read" + ); + + let names: Vec = names.into_iter().map(|c| c as u8).collect(); // fml + + let mut buffer = [0; 65536]; + for name in names.split_inclusive(|c| *c == 0) { + let name = CStr::from_bytes_with_nul(name)?; + let value_size = getxattr(&filename, name, &mut buffer)?; + let key = Box::from(OsStr::from_bytes(name.to_bytes())); + let value = Box::from(&buffer[..value_size]); + xattrs.insert(key, value); + } + + Ok(xattrs) + } + + fn stat(&mut self, fd: &OwnedFd, ifmt: FileType) -> Result<(rustix::fs::Stat, Stat)> { + let buf = fstat(fd)?; + + ensure!( + FileType::from_raw_mode(buf.st_mode) == ifmt, + "File type changed + between readdir() and fstat()" + ); + + let mtime = buf.st_mtime as i64; + + if buf.st_dev != self.st_dev { + if self.st_dev == u64::MAX { + self.st_dev = buf.st_dev; + } else { + bail!("Attempting to cross devices while importing filesystem"); + } + } else { + // The root mtime is equal to the most recent mtime of any inode *except* the root + // directory. Because self.st_dev is unset at first, we know we're in this branch only + // if this is the second (or later) inode we process (ie: not the root directory). 
+ if mtime > self.root_mtime { + self.root_mtime = mtime; + } + } + + Ok(( + buf, + Stat { + st_mode: buf.st_mode & 0o7777, + st_uid: buf.st_uid, + st_gid: buf.st_gid, + st_mtim_sec: mtime, + xattrs: RefCell::new(self.read_xattrs(fd)?), + }, + )) + } + + fn read_leaf_content(&mut self, fd: OwnedFd, buf: rustix::fs::Stat) -> Result { + let content = match FileType::from_raw_mode(buf.st_mode) { + FileType::Directory | FileType::Unknown => unreachable!(), + FileType::RegularFile => { + let mut buffer = vec![MaybeUninit::uninit(); buf.st_size as usize]; + let (data, _) = read_uninit(fd, &mut buffer)?; + + if buf.st_size > INLINE_CONTENT_MAX as i64 { + let id = if let Some(repo) = self.repo { + repo.ensure_object(data)? + } else { + FsVerityHasher::hash(data) + }; + LeafContent::ExternalFile(id, buf.st_size as u64) + } else { + LeafContent::InlineFile(Vec::from(data)) + } + } + FileType::Symlink => { + let target = readlinkat(fd, "", [])?; + LeafContent::Symlink(OsString::from_vec(target.into_bytes())) + } + FileType::CharacterDevice => LeafContent::CharacterDevice(buf.st_rdev), + FileType::BlockDevice => LeafContent::BlockDevice(buf.st_rdev), + FileType::Fifo => LeafContent::Fifo, + FileType::Socket => LeafContent::Socket, + }; + Ok(content) + } + + fn read_leaf(&mut self, dirfd: &OwnedFd, name: &OsStr, ifmt: FileType) -> Result> { + let oflags = match ifmt { + FileType::RegularFile => OFlags::RDONLY, + _ => OFlags::PATH, + }; + + let fd = openat( + dirfd, + name, + oflags | OFlags::NOFOLLOW | OFlags::CLOEXEC, + Mode::empty(), + )?; + + let (buf, stat) = self.stat(&fd, ifmt)?; + + if let Some(leafref) = self.inodes.get(&buf.st_ino) { + Ok(Rc::clone(leafref)) + } else { + let content = self.read_leaf_content(fd, buf)?; + let leaf = Rc::new(Leaf { stat, content }); + if buf.st_nlink > 1 { + self.inodes.insert(buf.st_ino, Rc::clone(&leaf)); + } + Ok(leaf) + } + } + + pub fn read_directory(&mut self, dirfd: impl AsFd, name: &OsStr) -> Result { + let fd = openat( + dirfd, + 
name, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::NOFOLLOW | OFlags::CLOEXEC, + Mode::empty(), + )?; + + let (_, stat) = self.stat(&fd, FileType::Directory)?; + let mut directory = Directory { + stat, + entries: vec![], + }; + + for item in Dir::read_from(&fd)? { + let entry = item?; + let name = OsStr::from_bytes(entry.file_name().to_bytes()); + + if name == "." || name == ".." { + continue; + } + + let inode = self.read_inode(&fd, name, entry.file_type())?; + directory.insert(name, inode); + } + + Ok(directory) + } + + fn read_inode(&mut self, dirfd: &OwnedFd, name: &OsStr, ifmt: FileType) -> Result { + if ifmt == FileType::Directory { + Ok(Inode::Directory(Box::new( + self.read_directory(dirfd, name)?, + ))) + } else { + Ok(Inode::Leaf(self.read_leaf(dirfd, name, ifmt)?)) + } + } +} + +pub fn read_from_path(path: &Path, repo: Option<&Repository>) -> Result { + let mut reader = FilesystemReader { + repo, + inodes: HashMap::new(), + st_dev: u64::MAX, + root_mtime: 0, + }; + let mut fs = FileSystem { + root: reader.read_directory(CWD, path.as_os_str())?, + }; + fs.root.stat.st_mtim_sec = reader.root_mtime; + + // We can only relabel if we have the repo because we need to read the config and policy files + if let Some(repo) = repo { + selabel(&mut fs, repo)?; + } + + Ok(fs) +} + +pub fn create_image(path: &Path, repo: Option<&Repository>) -> Result { + let fs = read_from_path(path, repo)?; + let image = super::image::mkcomposefs(fs)?; + if let Some(repo) = repo { + Ok(repo.write_image(None, &image)?) + } else { + Ok(FsVerityHasher::hash(&image)) + } +} + +pub fn create_dumpfile(path: &Path) -> Result<()> { + let fs = read_from_path(path, None)?; + super::dumpfile::write_dumpfile(&mut std::io::stdout(), &fs) +}