From 3ce5492f852c4e4e07d02c9a93f0b0fffcb00184 Mon Sep 17 00:00:00 2001 From: Noel Georgi Date: Fri, 26 Jul 2024 19:21:12 +0530 Subject: [PATCH] feat: runc memfd-bind service Add a `runc-memfd-bind` service so that runc binary is not copied for every `runc` invocation. Fixes: #9007. Signed-off-by: Noel Georgi --- go.mod | 3 +- go.sum | 6 +- .../pkg/controllers/cri/runc_memfd_bind.go | 157 ++++++++++++++++++ .../runtime/v1alpha2/v1alpha2_controller.go | 3 + .../db/microsoft option rom uefi ca 2023.der | Bin 0 -> 1459 bytes 5 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 internal/app/machined/pkg/controllers/cri/runc_memfd_bind.go create mode 100644 internal/pkg/secureboot/database/certs/db/microsoft option rom uefi ca 2023.der diff --git a/go.mod b/go.mod index 3ec1e57f84..da1c584f7e 100644 --- a/go.mod +++ b/go.mod @@ -109,6 +109,7 @@ require ( github.com/nberlee/go-netstat v0.1.2 github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.1.0 + github.com/opencontainers/runc v1.2.0-rc.2 github.com/opencontainers/runtime-spec v1.2.0 github.com/packethost/packngo v0.31.0 github.com/pelletier/go-toml/v2 v2.2.2 @@ -298,7 +299,7 @@ require ( github.com/opencontainers/selinux v1.11.0 // indirect github.com/opentracing/opentracing-go v1.2.0 // indirect github.com/peterbourgon/diskv v2.0.1+incompatible // indirect - github.com/pierrec/lz4/v4 v4.1.15 // indirect + github.com/pierrec/lz4/v4 v4.1.18 // indirect github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/pkg/errors v0.9.1 // indirect github.com/planetscale/vtprotobuf v0.6.0 // indirect diff --git a/go.sum b/go.sum index 5d6c469bec..7aeb1827a8 100644 --- a/go.sum +++ b/go.sum @@ -558,6 +558,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 
h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= +github.com/opencontainers/runc v1.2.0-rc.2 h1:5P32s2x9w1gAk20jbkwbQCZCfqVFashCwjD1UL2Ykc4= +github.com/opencontainers/runc v1.2.0-rc.2/go.mod h1:H8njh/SD+WY9bYMmVsEEWDJgJdviOSDjNeXMjeNbYCE= github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= @@ -574,8 +576,8 @@ github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6 github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU= -github.com/pierrec/lz4/v4 v4.1.15 h1:MO0/ucJhngq7299dKLwIMtgTfbkoSPF6AoMYDd8Q4q0= -github.com/pierrec/lz4/v4 v4.1.15/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ= +github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pin/tftp/v3 v3.1.0 h1:rQaxd4pGwcAJnpId8zC+O2NX3B2/NscjDZQaqEjuE7c= github.com/pin/tftp/v3 v3.1.0/go.mod h1:xwQaN4viYL019tM4i8iecm++5cGxSqen6AJEOEyEI0w= github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= diff --git a/internal/app/machined/pkg/controllers/cri/runc_memfd_bind.go b/internal/app/machined/pkg/controllers/cri/runc_memfd_bind.go new file mode 100644 index 0000000000..89451282a2 --- /dev/null +++ b/internal/app/machined/pkg/controllers/cri/runc_memfd_bind.go @@ -0,0 +1,157 @@ +// This Source Code Form is subject 
to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package cri + +import ( + "context" + "fmt" + "io" + "os" + "runtime" + + "github.com/cosi-project/runtime/pkg/controller" + "github.com/opencontainers/runc/libcontainer/dmz" + "go.uber.org/zap" + "golang.org/x/sys/unix" + + runtimetalos "github.com/siderolabs/talos/internal/app/machined/pkg/runtime" +) + +// RuncMemFDBindController creates a locked memfd bind for the runc binary, so that it can be used instead of copying the actual runc binary every time. +type RuncMemFDBindController struct { + V1Alpha1Mode runtimetalos.Mode +} + +// Name implements controller.Controller interface. +func (ctrl *RuncMemFDBindController) Name() string { + return "cri.RuncMemFDBindController" +} + +// Inputs implements controller.Controller interface. +func (ctrl *RuncMemFDBindController) Inputs() []controller.Input { + return nil +} + +// Outputs implements controller.Controller interface. +func (ctrl *RuncMemFDBindController) Outputs() []controller.Output { + return nil +} + +// Run implements controller.Controller interface. +func (ctrl *RuncMemFDBindController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error { + // This controller is not relevant in container mode. + if ctrl.V1Alpha1Mode == runtimetalos.ModeContainer { + return nil + } + + runcPath := "/bin/runc" + + memfdFile, err := memfdClone(runcPath) + if err != nil { + return fmt.Errorf("memfd clone: %w", err) + } + defer memfdFile.Close() //nolint:errcheck + + memfdPath := fmt.Sprintf("/proc/self/fd/%d", memfdFile.Fd()) + + // We have to open an O_NOFOLLOW|O_PATH to the memfd magic-link because we + // cannot bind-mount the memfd itself (it's in the internal kernel mount + // namespace and cross-mount-namespace bind-mounts are not allowed). 
This + // also requires that this program stay alive continuously for the + // magic-link to stay alive... + memfdLink, err := os.OpenFile(memfdPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("mount: failed to /proc/self/fd magic-link for memfd: %w", err) + } + defer memfdLink.Close() //nolint:errcheck + + memfdLinkFdPath := fmt.Sprintf("/proc/self/fd/%d", memfdLink.Fd()) + + exeFile, err := os.OpenFile(runcPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("mount: failed to open target runc binary path: %w", err) + } + defer exeFile.Close() //nolint:errcheck + + exeFdPath := fmt.Sprintf("/proc/self/fd/%d", exeFile.Fd()) + + err = unix.Mount(memfdLinkFdPath, exeFdPath, "", unix.MS_BIND, "") + if err != nil { + return fmt.Errorf("mount: failed to mount memfd on top of runc binary path target: %w", err) + } + + // Clean up things we don't need... + _ = exeFile.Close() //nolint:errcheck + _ = memfdLink.Close() //nolint:errcheck + + for { + select { + case <-ctx.Done(): + return cleanup(runcPath, logger) + case <-r.EventCh(): + } + + runtime.KeepAlive(memfdFile) + } +} + +// memfdClone is a memfd-only implementation of dmz.CloneBinary. 
+func memfdClone(path string) (*os.File, error) { + binFile, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("failed to open runc binary path: %w", err) + } + defer binFile.Close() //nolint:errcheck + + stat, err := binFile.Stat() + if err != nil { + return nil, fmt.Errorf("checking %s size: %w", path, err) + } + + size := stat.Size() + + memfd, sealFn, err := dmz.Memfd("/proc/self/exe") + if err != nil { + return nil, fmt.Errorf("creating memfd failed: %w", err) + } + + copied, err := io.Copy(memfd, binFile) + if err != nil { + return nil, fmt.Errorf("copy binary: %w", err) + } else if copied != size { + return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size) + } + + if err := sealFn(&memfd); err != nil { + return nil, fmt.Errorf("could not seal fd: %w", err) + } + + if !dmz.IsCloned(memfd) { + return nil, fmt.Errorf("cloned memfd is not properly sealed") + } + + return memfd, nil +} + +func cleanup(path string, logger *zap.Logger) error { + file, err := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("cleanup: failed to open runc binary path: %w", err) + } + + defer file.Close() //nolint:errcheck + + fdPath := fmt.Sprintf("/proc/self/fd/%d", file.Fd()) + + // Keep umounting until we hit a umount error. + for unix.Unmount(fdPath, unix.MNT_DETACH) == nil { + // loop... 
+ logger.Info(fmt.Sprintf("memfd-bind: path %q unmount succeeded...", path)) + } + + logger.Info(fmt.Sprintf("memfd-bind: path %q has been cleared of all old bind-mounts", path)) + + return nil +} diff --git a/internal/app/machined/pkg/runtime/v1alpha2/v1alpha2_controller.go b/internal/app/machined/pkg/runtime/v1alpha2/v1alpha2_controller.go index 0f27a63de1..0f540e1951 100644 --- a/internal/app/machined/pkg/runtime/v1alpha2/v1alpha2_controller.go +++ b/internal/app/machined/pkg/runtime/v1alpha2/v1alpha2_controller.go @@ -119,6 +119,9 @@ func (ctrl *Controller) Run(ctx context.Context, drainer *runtime.Drainer) error ValidationMode: ctrl.v1alpha1Runtime.State().Platform().Mode(), }, &config.MachineTypeController{}, + &cri.RuncMemFDBindController{ + V1Alpha1Mode: ctrl.v1alpha1Runtime.State().Platform().Mode(), + }, &cri.SeccompProfileController{}, &cri.SeccompProfileFileController{ V1Alpha1Mode: ctrl.v1alpha1Runtime.State().Platform().Mode(), diff --git a/internal/pkg/secureboot/database/certs/db/microsoft option rom uefi ca 2023.der b/internal/pkg/secureboot/database/certs/db/microsoft option rom uefi ca 2023.der new file mode 100644 index 0000000000000000000000000000000000000000..bfdda98f6df3edde8600a50b881bc9c273d79e97 GIT binary patch literal 1459 zcmXqLVqI_0#5{cgGZP~dldv%Z1B3YHH@^Lhj|y0UY#`5omyJ`a&7GD!8PUWhSQkiV}&sH>YN)M{gcCPpP>pE9yCFgG#sGXTZ8n3@pXjQ+q*kYrF!q2Qk3FX(&yf4)M9#IUw*;l&@Jv~Jed#uJHN{)HD$Z$UKVSY6#WIC z?mmp&^?a}8ti7kbA2zR&ev$9zGcm|{>5+iYjIX!Wcr2Rp#&kJLScc@IjY(G~eQC8f zSty#`u9zo(@`kC=4%^car!<{Iyz06`tCg#)w#{{BXUU$G{pEwg|6j}QFPi81BJ^l! 
zhs&BJj#kVyyI)RzlJoIxrP}dsqwkTPs)h{d5biY((Q@Yj*?^KH(sC0 zF~R>&O7xt?G1r2n0~zPfIDaJg=*Ih-GxA-%Pq?<8kliuwVtL_$tBcCp-=yp9Y-e-3 z>*(x|6|c|4%*epFxQQ_r7$TVle86y(LFUZc!FDTYeF3Qn|rwLUf17L~-lF&2>;saA9 zka7lzf>Nd)P=i4c8wb#ntgP&ej4WveDF#U}z5!EP7(p9Ti%N<~wX*~{Ndhw;a}zLV zy}TR#6mD$4%hs(TaIx%%z>`#yg}WO$Y74H2s{EW_?UA47r~I!q_|2l3@|{mi_Fb5< z{^Jkb&+Zqu8l}v;S9CzUu2G@tVEn|depbt>GjCqF_Tg7U^3ek>>6g3J6AwPU6F4(Z z`J|@AKhwRgifj0PCOqQ57+=6YNjt*t{F)Sf&f>QL948*O7CYz|uH3Rs+53Q`^)S%J;SD<O_`Gd(R$+umd*he``fa%B^~)m#(6?#`wc^uWl|C1A%2hcCsh_Oq|m$ z{j2TayzHl(iKQzRTgE?{pTpOx_-t?4(pRp{8FKn&`yXU~{E^sLJ{9VUrQd zQq3QcZ`8J(xUT$x`*Kcl)-C-@mzS!}niaS7FVoUDizYN?*x9=F)v#{Tdo`2M^)IKv z6t(MKI{#{$T#TJVPe(alnl$O3pZhUSdsAt}Ia#-w{-l`yR)6)y;C||x3x5I_EwbL+ Z3H