Skip to content

Commit

Permalink
feat: runc memfd-bind service
Browse files Browse the repository at this point in the history
Add a `runc-memfd-bind` service so that the runc binary is not copied for
every `runc` invocation.

Fixes: #9007.

Signed-off-by: Noel Georgi <git@frezbo.dev>
  • Loading branch information
frezbo committed Jul 29, 2024
1 parent 341b55c commit 3ce5492
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 3 deletions.
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ require (
github.com/nberlee/go-netstat v0.1.2
github.com/opencontainers/go-digest v1.0.0
github.com/opencontainers/image-spec v1.1.0
github.com/opencontainers/runc v1.2.0-rc.2
github.com/opencontainers/runtime-spec v1.2.0
github.com/packethost/packngo v0.31.0
github.com/pelletier/go-toml/v2 v2.2.2
Expand Down Expand Up @@ -298,7 +299,7 @@ require (
github.com/opencontainers/selinux v1.11.0 // indirect
github.com/opentracing/opentracing-go v1.2.0 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
github.com/pierrec/lz4/v4 v4.1.15 // indirect
github.com/pierrec/lz4/v4 v4.1.18 // indirect
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/planetscale/vtprotobuf v0.6.0 // indirect
Expand Down
6 changes: 4 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
github.com/opencontainers/runc v1.2.0-rc.2 h1:5P32s2x9w1gAk20jbkwbQCZCfqVFashCwjD1UL2Ykc4=
github.com/opencontainers/runc v1.2.0-rc.2/go.mod h1:H8njh/SD+WY9bYMmVsEEWDJgJdviOSDjNeXMjeNbYCE=
github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
Expand All @@ -574,8 +576,8 @@ github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6
github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs=
github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI=
github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU=
github.com/pierrec/lz4/v4 v4.1.15 h1:MO0/ucJhngq7299dKLwIMtgTfbkoSPF6AoMYDd8Q4q0=
github.com/pierrec/lz4/v4 v4.1.15/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ=
github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pin/tftp/v3 v3.1.0 h1:rQaxd4pGwcAJnpId8zC+O2NX3B2/NscjDZQaqEjuE7c=
github.com/pin/tftp/v3 v3.1.0/go.mod h1:xwQaN4viYL019tM4i8iecm++5cGxSqen6AJEOEyEI0w=
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
Expand Down
157 changes: 157 additions & 0 deletions internal/app/machined/pkg/controllers/cri/runc_memfd_bind.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package cri

import (
"context"
"fmt"
"io"
"os"
"runtime"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/opencontainers/runc/libcontainer/dmz"
"go.uber.org/zap"
"golang.org/x/sys/unix"

runtimetalos "github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
)

// RuncMemFDBindController creates a sealed memfd copy of the runc binary and
// bind-mounts it over the runc binary path, so that the copy can be used
// instead of copying the actual runc binary every time.
type RuncMemFDBindController struct {
// V1Alpha1Mode is the runtime mode; Run returns immediately without doing
// anything when it is runtimetalos.ModeContainer.
V1Alpha1Mode runtimetalos.Mode
}

// Name implements controller.Controller interface.
//
// It returns the fixed identifier "cri.RuncMemFDBindController".
func (ctrl *RuncMemFDBindController) Name() string {
return "cri.RuncMemFDBindController"
}

// Inputs implements controller.Controller interface.
//
// It returns nil: this controller declares no inputs.
func (ctrl *RuncMemFDBindController) Inputs() []controller.Input {
return nil
}

// Outputs implements controller.Controller interface.
//
// It returns nil: this controller declares no outputs.
func (ctrl *RuncMemFDBindController) Outputs() []controller.Output {
return nil
}

// Run implements controller.Controller interface.
//
// It clones /bin/runc into a sealed memfd, bind-mounts that memfd on top of
// /bin/runc and then blocks until ctx is cancelled, at which point the
// bind-mounts on /bin/runc are removed again via cleanup. The memfd is held
// open for the whole lifetime of the controller, because the bind-mount is
// only usable while the memfd stays alive.
//
// In container mode the controller does nothing and returns nil immediately.
func (ctrl *RuncMemFDBindController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	// This controller is not relevant in container mode, so it is skipped there.
	if ctrl.V1Alpha1Mode == runtimetalos.ModeContainer {
		return nil
	}

	const runcPath = "/bin/runc"

	memfdFile, err := memfdClone(runcPath)
	if err != nil {
		return fmt.Errorf("memfd clone: %w", err)
	}
	defer memfdFile.Close() //nolint:errcheck

	if err = bindMemfdOverPath(memfdFile, runcPath); err != nil {
		return err
	}

	for {
		select {
		case <-ctx.Done():
			return cleanup(runcPath, logger)
		case <-r.EventCh():
		}

		// Make it explicit that the memfd must outlive the bind-mount.
		runtime.KeepAlive(memfdFile)
	}
}

// bindMemfdOverPath bind-mounts the given memfd on top of targetPath.
//
// Both temporary O_PATH handles opened here are closed before returning,
// regardless of whether the mount succeeded.
func bindMemfdOverPath(memfdFile *os.File, targetPath string) error {
	memfdPath := fmt.Sprintf("/proc/self/fd/%d", memfdFile.Fd())

	// We have to open an O_NOFOLLOW|O_PATH to the memfd magic-link because we
	// cannot bind-mount the memfd itself (it's in the internal kernel mount
	// namespace and cross-mount-namespace bind-mounts are not allowed). This
	// also requires that this program stay alive continuously for the
	// magic-link to stay alive...
	memfdLink, err := os.OpenFile(memfdPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
	if err != nil {
		return fmt.Errorf("mount: failed to open /proc/self/fd magic-link for memfd: %w", err)
	}
	defer memfdLink.Close() //nolint:errcheck

	exeFile, err := os.OpenFile(targetPath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
	if err != nil {
		return fmt.Errorf("mount: failed to open target runc binary path: %w", err)
	}
	defer exeFile.Close() //nolint:errcheck

	// Mount through the /proc/self/fd paths of the two O_PATH handles rather
	// than through the original paths.
	memfdLinkFdPath := fmt.Sprintf("/proc/self/fd/%d", memfdLink.Fd())
	exeFdPath := fmt.Sprintf("/proc/self/fd/%d", exeFile.Fd())

	if err := unix.Mount(memfdLinkFdPath, exeFdPath, "", unix.MS_BIND, ""); err != nil {
		return fmt.Errorf("mount: failed to mount memfd on top of runc binary path target: %w", err)
	}

	return nil
}

// memfdClone is a memfd-only implementation of dmz.CloneBinary.
//
// It copies the file at path into a freshly created memfd, seals it, and
// returns the memfd. The caller owns the returned file and is responsible for
// closing it. If any step after the memfd is created fails, the memfd is
// closed before returning so that the descriptor is not leaked.
func memfdClone(path string) (*os.File, error) {
	binFile, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("failed to open runc binary path: %w", err)
	}
	defer binFile.Close() //nolint:errcheck

	stat, err := binFile.Stat()
	if err != nil {
		return nil, fmt.Errorf("checking %s size: %w", path, err)
	}

	size := stat.Size()

	memfd, sealFn, err := dmz.Memfd("/proc/self/exe")
	if err != nil {
		return nil, fmt.Errorf("creating memfd failed: %w", err)
	}

	// Release the memfd on every failure path below. The closure reads the
	// memfd variable at return time, so it also covers the case where sealFn
	// replaces the file through the pointer it is given.
	cloned := false

	defer func() {
		if !cloned {
			memfd.Close() //nolint:errcheck
		}
	}()

	copied, err := io.Copy(memfd, binFile)
	if err != nil {
		return nil, fmt.Errorf("copy binary: %w", err)
	}

	if copied != size {
		return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
	}

	if err := sealFn(&memfd); err != nil {
		return nil, fmt.Errorf("could not seal fd: %w", err)
	}

	if !dmz.IsCloned(memfd) {
		return nil, fmt.Errorf("cloned memfd is not properly sealed")
	}

	cloned = true

	return memfd, nil
}

// cleanup unmounts whatever is mounted on top of path.
//
// The path is opened as an O_PATH|O_NOFOLLOW handle and unmounted with
// MNT_DETACH through its /proc/self/fd entry. Unmounting is repeated until an
// unmount call fails; that final unmount error is not reported. The only error
// returned is a failure to open path.
func cleanup(path string, logger *zap.Logger) error {
	target, openErr := os.OpenFile(path, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
	if openErr != nil {
		return fmt.Errorf("cleanup: failed to open runc binary path: %w", openErr)
	}

	defer target.Close() //nolint:errcheck

	procFdPath := fmt.Sprintf("/proc/self/fd/%d", target.Fd())

	// Each successful unmount is logged; the first failing unmount ends the loop.
	for {
		if unmountErr := unix.Unmount(procFdPath, unix.MNT_DETACH); unmountErr != nil {
			break
		}

		logger.Info(fmt.Sprintf("memfd-bind: path %q unmount succeeded...", path))
	}

	logger.Info(fmt.Sprintf("memfd-bind: path %q has been cleared of all old bind-mounts", path))

	return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ func (ctrl *Controller) Run(ctx context.Context, drainer *runtime.Drainer) error
ValidationMode: ctrl.v1alpha1Runtime.State().Platform().Mode(),
},
&config.MachineTypeController{},
&cri.RuncMemFDBindController{
V1Alpha1Mode: ctrl.v1alpha1Runtime.State().Platform().Mode(),
},
&cri.SeccompProfileController{},
&cri.SeccompProfileFileController{
V1Alpha1Mode: ctrl.v1alpha1Runtime.State().Platform().Mode(),
Expand Down
Binary file not shown.

0 comments on commit 3ce5492

Please sign in to comment.