From fa023a9e5b3fd0dd9e1ac359792958a877970fbb Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Tue, 16 Jul 2024 10:47:57 -0400 Subject: [PATCH] WIP: bootstrap: pivot into node image before bootstrapping As per https://github.com/openshift/enhancements/pull/1637, we're trying to get rid of all OpenShift-versioned components from the bootimages. This means that there will no longer be `oc`, `kubelet`, or `crio` binaries for example, which bootstrapping obviously relies on. Instead, now we change things up so that early on when booting the bootstrap node, we pull down the node image, unencapsulate it (this just means convert it back to an OSTree commit), then mount over its `/usr`, and import new `/etc` content. This is done by isolating to a different systemd target to only bring up the minimum number of services to do the pivot and then carry on with bootstrapping. This does not incur additional reboots and should be compatible with AI/ABI/SNO. But it is of course, a huge conceptual shift in how bootstrapping works. With this, we would now always be sure that we're using the same binaries as the target version as part of bootstrapping, which should alleviate some issues such as AI late-binding (see e.g. https://issues.redhat.com/browse/MGMT-16705). The big exception of course being the kernel. Relatedly, note we do persist `/usr/lib/modules` from the booted system so that loading kernel modules still works. To be conservative, the new logic only kicks in when using bootimages which do not have `oc`. This will allow us to ratchet this in more easily. Down the line, we should be able to replace some of this with `bootc apply-live` once that's available (and also works in a live environment). (See https://github.com/containers/bootc/issues/76.) For full context, see the linked enhancement and discussions there. --- .../node-image-overlay-generator | 9 ++ .../systemd/system/node-image-finish.service | 12 +++ .../systemd/system/node-image-overlay.service | 9 ++ .../systemd/system/node-image-overlay.target | 9 ++ .../systemd/system/node-image-pull.service | 16 ++++ .../files/usr/local/bin/node-image-overlay.sh | 15 ++++ .../usr/local/bin/node-image-pull.sh.template | 82 +++++++++++++++++++ pkg/asset/ignition/bootstrap/common.go | 2 +- 8 files changed, 153 insertions(+), 1 deletion(-) create mode 100755 data/data/bootstrap/files/etc/systemd/system-generators/node-image-overlay-generator create mode 100644 data/data/bootstrap/files/etc/systemd/system/node-image-finish.service create mode 100644 data/data/bootstrap/files/etc/systemd/system/node-image-overlay.service create mode 100644 data/data/bootstrap/files/etc/systemd/system/node-image-overlay.target create mode 100644 data/data/bootstrap/files/etc/systemd/system/node-image-pull.service create mode 100755 data/data/bootstrap/files/usr/local/bin/node-image-overlay.sh create mode 100755 data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template diff --git a/data/data/bootstrap/files/etc/systemd/system-generators/node-image-overlay-generator b/data/data/bootstrap/files/etc/systemd/system-generators/node-image-overlay-generator new file mode 100755 index 00000000000..44ed434a691 --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system-generators/node-image-overlay-generator @@ -0,0 +1,9 @@ +#!/bin/bash +set -euo pipefail + +UNIT_DIR="${1:-/tmp}" + +if ! rpm -q openshift-clients &>/dev/null; then + ln -sf "/etc/systemd/system/node-image-overlay.target" \ + "${UNIT_DIR}/default.target" +fi diff --git a/data/data/bootstrap/files/etc/systemd/system/node-image-finish.service b/data/data/bootstrap/files/etc/systemd/system/node-image-finish.service new file mode 100644 index 00000000000..371fd2412d4 --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system/node-image-finish.service @@ -0,0 +1,12 @@ +# This is a separate unit because in the assisted-installer flow, we only want +# `node-image-overlay.service`, not the isolating back to `multi-user.target`. + +[Unit] +Description=Node Image Finish +Requires=node-image-overlay.service +After=node-image-overlay.service + +[Service] +Type=oneshot +# and now, back to our regularly scheduled programming... +ExecStart=/usr/bin/systemctl --no-block isolate multi-user.target diff --git a/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.service b/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.service new file mode 100644 index 00000000000..7e1ea029ee7 --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.service @@ -0,0 +1,9 @@ +[Unit] +Description=Node Image Overlay +Requires=node-image-pull.service +After=node-image-pull.service + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/node-image-overlay.sh +RemainAfterExit=yes diff --git a/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.target b/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.target new file mode 100644 index 00000000000..ef52f78ed93 --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.target @@ -0,0 +1,9 @@ +[Unit] +Description=Node Image Overlay Target +Requires=basic.target + +# for easier debugging +Requires=sshd.service getty.target systemd-user-sessions.service + +Requires=node-image-overlay.service +Requires=node-image-finish.service diff --git a/data/data/bootstrap/files/etc/systemd/system/node-image-pull.service b/data/data/bootstrap/files/etc/systemd/system/node-image-pull.service new file mode 100644 index 00000000000..1d5b510ac2d --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system/node-image-pull.service @@ -0,0 +1,16 @@ +[Unit] +Description=Node Image Pull +Requires=network.target NetworkManager.service +After=network.target + +[Service] +Type=oneshot +# we need to call ostree container (i.e. rpm-ostree), which has install_exec_t, +# but by default, we'll run as unconfined_service_t, which is not allowed that +# transition. Relabel the script itself. +ExecStartPre=chcon --reference=/usr/bin/ostree /usr/local/bin/node-image-pull.sh +ExecStart=/usr/local/bin/node-image-pull.sh +# see related XXX in node-image-pull.sh +TimeoutStartSec=infinity +MountFlags=slave +RemainAfterExit=yes diff --git a/data/data/bootstrap/files/usr/local/bin/node-image-overlay.sh b/data/data/bootstrap/files/usr/local/bin/node-image-overlay.sh new file mode 100755 index 00000000000..9a3ad4efb06 --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/node-image-overlay.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -euo pipefail + +ostree_checkout=/ostree/repo/tmp/node-image +if [ ! -d "${ostree_checkout}" ]; then + ostree_checkout=/var/ostree-container/checkout +fi + +# keep /usr/lib/modules from the booted deployment for kernel modules +mount -o bind,ro "/usr/lib/modules" "${ostree_checkout}/usr/lib/modules" +mount -o rbind,ro "${ostree_checkout}/usr" /usr +rsync -a "${ostree_checkout}/usr/etc/" /etc + +# reload the new policy +semodule -R diff --git a/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template b/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template new file mode 100755 index 00000000000..97c76c5079d --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template @@ -0,0 +1,82 @@ +#!/bin/bash +set -euo pipefail + +# shellcheck source=release-image.sh.template +. /usr/local/bin/release-image.sh + +# yuck... this is a good argument for renaming the node image to just `node` in both OCP and OKD +coreos_img=rhel-coreos +{{ if .IsOKD }} +coreos_img=stream-coreos +{{ end }} +# XXX: Unset NOTIFY_SOCKET for podman to workaround an outstanding bug in +# RHEL. When it sees the socket, it wants to keep extending the service start +# timeout. It writes to stderr, but we use `--quiet` which leaves it null, +# so it hits SIGSEGV. To work around not having timeout extensions; we use +# TimeoutStartSec=infinity. +# This is fixed upstream by https://github.com/containers/common/pull/1758. +# Should request backport... +while ! COREOS_IMAGE=$(unset NOTIFY_SOCKET; image_for ${coreos_img}); do + echo 'Failed to query release image; retrying...' + sleep 10 +done + +# try to do this in the system repo so we get hardlinks and the checkout is +# read-only, but fallback to using /var if we're in the live environment since +# that's truly read-only +ostree_repo=/ostree/repo +ostree_checkout="${ostree_repo}/tmp/node-image" +hardlink='-H' +if grep -q coreos.liveiso= /proc/cmdline; then + ostree_repo=/var/ostree-container/repo + ostree_checkout=/var/ostree-container/checkout + mkdir -p "${ostree_repo}" + ostree init --mode=bare --repo="${ostree_repo}" + # if there are layers, import all the content in the system repo for + # layer-level deduping + if [ -d /ostree/repo/refs/heads/ostree/container ]; then + ostree pull-local --repo="${ostree_repo}" /ostree/repo + fi + # but we won't be able to force hardlinks cross-device + hardlink='' +else + # (remember, we're MountFlags=slave) + mount -o rw,remount /sysroot +fi + +# Use ostree stack to pull the container here. This gives us efficient +# downloading with layers we already have, and also handles SELinux. +while ! ostree container image pull --authfile "/root/.docker/config.json" \ + "${ostree_repo}" ostree-unverified-image:docker://"${COREOS_IMAGE}"; do + echo 'Failed to fetch release image; retrying...' + sleep 10 +done + +# ideally, `ostree container image pull` would support `--write-ref` or a +# command to escape a pullspec, but for now it's pretty easy to tell which ref +# it is since it's the only docker one +ref=$(ostree refs --repo "${ostree_repo}" | grep ^ostree/container/image/docker) +if [ $(echo "$ref" | wc -l) != 1 ]; then + echo "Expected single docker ref, found:" + echo "$ref" + exit 1 +fi +ostree refs --repo "${ostree_repo}" "$ref" --create coreos/node-image + +# massive hack to make ostree admin config-diff work in live ISO where /etc +# is actually on a separate mount and not the deployment root proper... should +# enhance libostree for this (remember, we're MountFlags=slave) +if grep -q coreos.liveiso= /proc/cmdline; then + mount -o bind,ro /etc /ostree/deploy/*/deploy/*/etc +fi + +# get all state files in /etc; this is a cheap way to get "3-way /etc merge" semantics +etc_keep=$(ostree admin config-diff | cut -f5 -d' ' | sed -e 's,^,/usr/etc/,') + +# check out the commit +ostree checkout --repo "${ostree_repo}" ${hardlink} coreos/node-image "${ostree_checkout}" --skip-list=<(cat <<< "$etc_keep") + +# in the assisted-installer case, nuke the temporary repo to save RAM +if grep -q coreos.liveiso= /proc/cmdline; then + rm -rf "${ostree_repo}" +fi diff --git a/pkg/asset/ignition/bootstrap/common.go b/pkg/asset/ignition/bootstrap/common.go index 0638d2b246d..31cfcc8cbef 100644 --- a/pkg/asset/ignition/bootstrap/common.go +++ b/pkg/asset/ignition/bootstrap/common.go @@ -428,7 +428,7 @@ func AddStorageFiles(config *igntypes.Config, base string, uri string, templateD var mode int appendToFile := false - if parentDir == "bin" || parentDir == "dispatcher.d" { + if parentDir == "bin" || parentDir == "dispatcher.d" || parentDir == "system-generators" { mode = 0555 } else if filename == "motd" || filename == "containers.conf" { mode = 0644