Skip to content

Commit

Permalink
lib/sysroot-deploy: Add experimental support for automatic early prune
Browse files Browse the repository at this point in the history
During the early design of FCOS and RHCOS, we chose a value of 384M
for the boot partition. This turned out to be too small: some arches
other than x86_64 have larger initrds, kernel binaries, or additional
artifacts (like device tree blobs). We'll likely bump the boot partition
size in the future, but we don't want to abandon all the nodes deployed
with the current size.[[1]]

Because stale entries in `/boot` are cleaned up after new entries are
written, there is a window in the update process during which the bootfs
temporarily must host all the `(kernel, initrd)` pairs for the union of
current and new deployments.

This patch determines if the bootfs is capable of holding all the
pairs. If it can't but it could hold all the pairs from just the new
deployments, the outgoing deployments (e.g. rollbacks) are deleted
*before* new deployments are written. This is done by updating the
bootloader in two steps to maintain atomicity.

Since this is a lot of new logic in an important section of the
code, this feature is gated for now behind an environment variable
(`OSTREE_ENABLE_AUTO_EARLY_PRUNE`). Once we gain more experience with
it, we can consider turning it on by default.

This strategy increases the fallibility of the update system since one
would no longer be able to rollback to the previous deployment if a bug
is present in the bootloader update logic after auto-pruning (see [[2]]
and following). This is however mitigated by the fact that the heuristic
is opportunistic: the rollback is pruned *only if* it's the only way for
the system to update.

[1]: coreos/fedora-coreos-tracker#1247
[2]: #2670 (comment)

Closes: #2670
  • Loading branch information
jlebon committed May 1, 2023
1 parent fd3304e commit c561e61
Show file tree
Hide file tree
Showing 5 changed files with 350 additions and 3 deletions.
255 changes: 252 additions & 3 deletions src/libostree/ostree-sysroot-deploy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1925,8 +1925,8 @@ install_deployment_kernel (OstreeSysroot *sysroot,
}
else
{
if (!copy_dir_recurse(kernel_layout->boot_dfd, bootcsum_dfd, kernel_layout->devicetree_srcpath,
sysroot->debug_flags, cancellable, error))
if (!copy_dir_recurse (kernel_layout->boot_dfd, bootcsum_dfd, kernel_layout->devicetree_srcpath,
sysroot->debug_flags, cancellable, error))
return FALSE;
}
}
Expand Down Expand Up @@ -1959,6 +1959,8 @@ install_deployment_kernel (OstreeSysroot *sysroot,
}
}

/* NOTE: if adding more things in bootcsum_dfd, also update get_kernel_layout_size() */

g_autoptr(GPtrArray) overlay_initrds = NULL;
for (char **it = _ostree_deployment_get_overlay_initrds (deployment); it && *it; it++)
{
Expand Down Expand Up @@ -2487,6 +2489,247 @@ write_deployments_finish (OstreeSysroot *self,
return TRUE;
}

static gboolean
add_file_size_if_nonnull (int dfd,
const char *path,
guint64 *inout_size,
GError **error)
{
if (path == NULL)
return TRUE;

struct stat stbuf;
if (!glnx_fstatat (dfd, path, &stbuf, 0, error))
return FALSE;

*inout_size += stbuf.st_size;
return TRUE;
}

/* calculates the total size of the bootcsum dir in /boot after we would copy
* it. This reflects the logic in install_deployment_kernel(). */
static gboolean
get_kernel_layout_size (OstreeSysroot *self,
OstreeDeployment *deployment,
guint64 *out_size,
GCancellable *cancellable,
GError **error)
{
g_autofree char *deployment_dirpath = ostree_sysroot_get_deployment_dirpath (self, deployment);
glnx_autofd int deployment_dfd = -1;
if (!glnx_opendirat (self->sysroot_fd, deployment_dirpath, FALSE,
&deployment_dfd, error))
return FALSE;

g_autoptr(OstreeKernelLayout) kernel_layout = NULL;
if (!get_kernel_from_tree (self, deployment_dfd, &kernel_layout,
cancellable, error))
return FALSE;

guint64 bootdir_size = 0;
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->kernel_srcpath, &bootdir_size, error))
return FALSE;
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->initramfs_srcpath, &bootdir_size, error))
return FALSE;
if (kernel_layout->devicetree_srcpath)
{
/* These conditionals mirror the logic in install_deployment_kernel(). */
if (kernel_layout->devicetree_namever)
{
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->devicetree_srcpath, &bootdir_size, error))
return FALSE;
}
else
{
guint64 dirsize = 0;
if (!ot_get_dir_size (kernel_layout->boot_dfd, kernel_layout->devicetree_srcpath, &dirsize, cancellable, error))
return FALSE;
bootdir_size += dirsize;
}
}
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->kernel_hmac_srcpath, &bootdir_size, error))
return FALSE;
if (!add_file_size_if_nonnull (kernel_layout->boot_dfd, kernel_layout->aboot_srcpath, &bootdir_size, error))
return FALSE;

*out_size = bootdir_size;
return TRUE;
}

/* Analyze /boot and figure out if the new deployments won't fit in the
* remaining space. If they won't, check if deleting the deployments that are
* getting rotated out (e.g. the current rollback) would free up sufficient
* space. If so, call ostree_sysroot_write_deployments() to delete them. */
static gboolean
auto_early_prune_old_deployments (OstreeSysroot *self,
GPtrArray *new_deployments,
GCancellable *cancellable,
GError **error)
{
/* If we're not booted into a deployment, then this is some kind of e.g. disk
* creation/provisioning. The situation isn't as dire, so let's not resort to
* auto-pruning and instead let possible ENOSPC errors naturally bubble. */
if (self->booted_deployment == NULL)
return TRUE;

{
struct stat stbuf;
if (!glnx_fstatat (self->boot_fd, ".", &stbuf, 0, error))
return FALSE;

/* if /boot is on the same filesystem as the sysroot (which must be where
* the sysroot repo is), don't do anything */
if (stbuf.st_dev == self->repo->device)
return TRUE;
}

/* pre-emptive cleanup of any cruft in /boot to free up any wasted space */
if (!_ostree_sysroot_cleanup_bootfs (self, cancellable, error))
return FALSE;

/* tracks all the bootcsums currently in /boot */
g_autoptr(GHashTable) current_bootcsums = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);

/* tracks all the bootcsums of new_deployments */
g_autoptr(GHashTable) new_bootcsums = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, NULL);

g_auto(GStrv) bootdirs = NULL;
if (!_ostree_sysroot_list_all_boot_directories (self, &bootdirs, cancellable, error))
return glnx_prefix_error (error, "listing bootcsum directories in bootfs");

for (char **it = bootdirs; it && *it; it++)
{
const char *bootdir = *it;

g_autofree char *bootcsum = NULL;
if (!_ostree_sysroot_parse_bootdir_name (bootdir, NULL, &bootcsum))
g_assert_not_reached (); /* checked in _ostree_sysroot_list_all_boot_directories() */

guint64 bootdir_size;
g_autofree char *ostree_bootdir = g_build_filename ("ostree", bootdir, NULL);
if (!ot_get_dir_size (self->boot_fd, ostree_bootdir, &bootdir_size, cancellable, error))
return FALSE;

/* for our purposes of sizing bootcsums, it's highly unlikely we need a
* guint64; cast it down to guint so we can more easily store it */
if (bootdir_size > G_MAXUINT)
{
/* If it somehow happens, don't make it fatal. this is all an
* optimization anyway, so let the deployment continue. But log it so
* that users report it and we tweak this code to handle this.
*
* An alternative is working with the block size instead, which would
* be easier to handle. But ideally, `ot_get_dir_size` would be block
* size aware too for better accuracy, which is awkward since the
* function itself is generic over directories and doesn't consider
* e.g. mount points from different filesystems. */
g_printerr ("bootcsum %s size exceeds %u; disabling auto-prune optimization\n", bootdir, G_MAXUINT);
return TRUE;
}

g_assert_cmpuint (bootdir_size, >, 0);
g_hash_table_insert (current_bootcsums, g_steal_pointer (&bootcsum), GUINT_TO_POINTER (bootdir_size));
}

/* total size of all bootcsums dirs that aren't already in /boot */
guint64 net_new_bootcsum_dirs_total_size = 0;

/* now gather all the bootcsums of the new deployments */
for (guint i = 0; i < new_deployments->len; i++)
{
OstreeDeployment *deployment = new_deployments->pdata[i];

const char *bootcsum = ostree_deployment_get_bootcsum (deployment);
gpointer bootdir_sizep = g_hash_table_lookup (current_bootcsums, bootcsum);
if (bootdir_sizep != 0)
{
g_hash_table_insert (new_bootcsums, g_strdup (bootcsum), bootdir_sizep);
continue;
}

guint64 bootdir_size;
if (!get_kernel_layout_size (self, deployment, &bootdir_size, cancellable, error))
return FALSE;

/* see similar logic in previous loop */
if (bootdir_size > G_MAXUINT)
{
g_printerr ("deployment %s kernel layout size exceeds %u; disabling auto-prune optimization\n",
ostree_deployment_get_csum (deployment), G_MAXUINT);
return TRUE;
}

g_hash_table_insert (new_bootcsums, g_strdup (bootcsum), GUINT_TO_POINTER (bootdir_size));

/* it wasn't in current_bootcsums; add */
net_new_bootcsum_dirs_total_size += bootdir_size;
}

/* get bootfs free space */
struct statvfs stvfsbuf;
if (TEMP_FAILURE_RETRY (fstatvfs (self->boot_fd, &stvfsbuf)) < 0)
return glnx_throw_errno_prefix (error, "fstatvfs(boot)");

guint64 available_size = stvfsbuf.f_bsize * stvfsbuf.f_bfree;

/* does the bootfs have enough free space for net-new bootdirs? */
if (net_new_bootcsum_dirs_total_size <= available_size)
return TRUE; /* nothing to do! */

/* OK, we would fail if we tried to write the new bootdirs. Is it salvageable?
* First, calculate how much space we could save with the bootcsums scheduled
* for removal. */
guint64 size_to_remove = 0;
GLNX_HASH_TABLE_FOREACH_KV (current_bootcsums, const char *, bootcsum, gpointer, sizep)
{
if (!g_hash_table_contains (new_bootcsums, bootcsum))
size_to_remove += GPOINTER_TO_UINT (sizep);
}

if (net_new_bootcsum_dirs_total_size > (available_size + size_to_remove))
{
/* Even if we auto-pruned, the new bootdirs wouldn't fit. Just let the
* code continue and let it hit ENOSPC. */
return TRUE;
}

g_printerr ("Insufficient space left in bootfs; updating bootloader in two steps\n");

/* Auto-pruning can salvage the situation. Calculate the set of deployments in common. */
g_autoptr(GPtrArray) common_deployments = g_ptr_array_new ();
for (guint i = 0; i < self->deployments->len; i++)
{
OstreeDeployment *deployment = self->deployments->pdata[i];
const char *bootcsum = ostree_deployment_get_bootcsum (deployment);
if (g_hash_table_contains (new_bootcsums, bootcsum))
{
g_ptr_array_add (common_deployments, deployment);
}
else
{
/* we always keep the booted deployment */
g_assert (deployment != self->booted_deployment);
}
}

/* if we're here, it means that removing some deployments is possible to gain space */
g_assert_cmpuint (common_deployments->len, <, self->deployments->len);

/* Do an initial write out where we do a pure deployment pruning, keeping
* common deployments. To be safe, disable auto-pruning to make recursion
* impossible (though the logic in this function shouldn't kick in anyway in
* that recursive call). Disable cleaning since it's an intermediate stage. */
OstreeSysrootWriteDeploymentsOpts opts = { .do_postclean = FALSE, .disable_auto_early_prune = TRUE };
if (!ostree_sysroot_write_deployments_with_options (self, common_deployments, &opts, cancellable, error))
return FALSE;

/* clean up /boot */
if (!_ostree_sysroot_cleanup_bootfs (self, cancellable, error))
return FALSE;

return TRUE;
}

/**
* ostree_sysroot_write_deployments_with_options:
* @self: Sysroot
Expand Down Expand Up @@ -2516,6 +2759,12 @@ ostree_sysroot_write_deployments_with_options (OstreeSysroot *self,
if (!_ostree_sysroot_ensure_writable (self, error))
return FALSE;

/* for now, this is gated on an environment variable */
const gboolean opted_in = (self->opt_flags & OSTREE_SYSROOT_GLOBAL_OPT_EARLY_PRUNE) > 0;
if (opted_in && !opts->disable_auto_early_prune &&
!auto_early_prune_old_deployments (self, new_deployments, cancellable, error))
return FALSE;

/* Dealing with the staged deployment is quite tricky here. This function is
* primarily concerned with writing out "finalized" deployments which have
* bootloader entries. Originally, we simply dropped the staged deployment
Expand Down Expand Up @@ -2630,7 +2879,7 @@ ostree_sysroot_write_deployments_with_options (OstreeSysroot *self,
OstreeDeployment *deployment = new_deployments->pdata[i];
g_assert (!ostree_deployment_is_staged (deployment));

if (deployment == self->booted_deployment)
if (ostree_deployment_equal (deployment, self->booted_deployment))
found_booted_deployment = TRUE;

g_autoptr(GFile) deployment_root = ostree_sysroot_get_deployment_directory (self, deployment);
Expand Down
2 changes: 2 additions & 0 deletions src/libostree/ostree-sysroot-private.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ typedef enum {
typedef enum {
/* Skip invoking `sync()` */
OSTREE_SYSROOT_GLOBAL_OPT_SKIP_SYNC = 1 << 0,
/* See https://github.com/ostreedev/ostree/pull/2847 */
OSTREE_SYSROOT_GLOBAL_OPT_EARLY_PRUNE = 1 << 1,
} OstreeSysrootGlobalOptFlags;

typedef enum {
Expand Down
1 change: 1 addition & 0 deletions src/libostree/ostree-sysroot.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ ostree_sysroot_init (OstreeSysroot *self)
{
const GDebugKey globalopt_keys[] = {
{ "skip-sync", OSTREE_SYSROOT_GLOBAL_OPT_SKIP_SYNC },
{ "early-prune", OSTREE_SYSROOT_GLOBAL_OPT_EARLY_PRUNE },
};
const GDebugKey keys[] = {
{ "mutable-deployments", OSTREE_SYSROOT_DEBUG_MUTABLE_DEPLOYMENTS },
Expand Down
1 change: 1 addition & 0 deletions src/libostree/ostree-sysroot.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ gboolean ostree_sysroot_write_deployments (OstreeSysroot *self,

typedef struct {
gboolean do_postclean;
gboolean disable_auto_early_prune;
gboolean unused_bools[7];
int unused_ints[7];
gpointer unused_ptrs[7];
Expand Down
Loading

0 comments on commit c561e61

Please sign in to comment.