Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

raw_exec: make raw exec driver work with cgroups v2 #12419

Merged
merged 2 commits into from
Apr 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/12419.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
Add support for cgroups v2 in raw_exec driver
```
3 changes: 2 additions & 1 deletion .github/workflows/test-core.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ jobs:
run: |
make bootstrap
make generate-all
make test-nomad-module
sudo sed -i 's!Defaults!#Defaults!g' /etc/sudoers
sudo -E env "PATH=$PATH" make test-nomad-module
Copy link
Member Author

@shoenig shoenig Mar 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TestFS_Log checks the alloc becomes complete, which it doesn't because it gets blocked by the missing cgroup for cpuset unless running as root.

tests-pkgs:
runs-on: ubuntu-20.04
timeout-minutes: 30
Expand Down
1 change: 1 addition & 0 deletions api/fs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ func TestFS_Logs(t *testing.T) {
testutil.Parallel(t)
require := require.New(t)
rpcPort := 0

c, s := makeClient(t, nil, func(c *testutil.TestServerConfig) {
rpcPort = c.Ports.RPC
c.Client = &testutil.ClientConfig{
Expand Down
1 change: 1 addition & 0 deletions api/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ type Node struct {
Links map[string]string
Meta map[string]string
NodeClass string
CgroupParent string
Drain bool
DrainStrategy *DrainStrategy
SchedulingEligibility string
Expand Down
1 change: 0 additions & 1 deletion client/allocdir/fs_unix.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
// +build darwin dragonfly freebsd linux netbsd openbsd solaris

package allocdir

Expand Down
1 change: 1 addition & 0 deletions client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,7 @@ func (c *Client) setupNode() error {
if node.Name == "" {
node.Name, _ = os.Hostname()
}
node.CgroupParent = c.config.CgroupParent
if node.HostVolumes == nil {
if l := len(c.config.HostVolumes); l != 0 {
node.HostVolumes = make(map[string]*structs.ClientHostVolumeConfig, l)
Expand Down
1 change: 1 addition & 0 deletions client/fs_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ func TestFS_List_ACL(t *testing.T) {

func TestFS_Stream_NoAlloc(t *testing.T) {
ci.Parallel(t)
ci.SkipSlow(t, "flaky on GHA; #12358")
require := require.New(t)

// Start a client
Expand Down
8 changes: 4 additions & 4 deletions client/lib/cgutil/cgutil_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ func CgroupScope(allocID, task string) string {
return fmt.Sprintf("%s.%s.scope", allocID, task)
}

// ConfigureBasicCgroups will initialize cgroups for v1.
// ConfigureBasicCgroups will initialize a cgroup and modify config to contain
// a reference to its path.
//
// Not useful in cgroups.v2
// v1: creates a random "freezer" cgroup which can later be used for cleanup of processes.
// v2: does nothing.
func ConfigureBasicCgroups(config *lcc.Config) error {
if UseV2 {
// In v2 the default behavior is to create inherited interface files for
// all mounted subsystems automatically.
return nil
}

Expand Down
210 changes: 210 additions & 0 deletions client/lib/cgutil/group_killer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
//go:build linux

package cgutil

import (
"errors"
"fmt"
"os"
"path/filepath"
"time"

"github.com/hashicorp/go-hclog"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
)

// freezer is the name of the cgroup subsystem used for stopping / starting
// a group of processes
const freezer = "freezer"

// thawed and frozen are the two states we put a cgroup in when trying to remove it;
// frozen prevents further forking while we collect and signal pids, thawed lets
// the signaled processes actually exit.
var (
	thawed = &configs.Resources{Freezer: configs.Thawed}
	frozen = &configs.Resources{Freezer: configs.Frozen}
)

// GroupKiller is used for SIGKILL-ing the process tree[s] of a cgroup by leveraging
// the freezer cgroup subsystem.
type GroupKiller interface {
	// KillGroup SIGKILLs every process in cgroup and removes the cgroup
	// from disk once the processes are confirmed dead.
	KillGroup(cgroup *configs.Cgroup) error
}

// NewGroupKiller creates a GroupKiller with executor PID pid.
func NewGroupKiller(logger hclog.Logger, pid int) GroupKiller {
	k := new(killer)
	k.logger = logger.Named("group_killer")
	k.pid = pid
	return k
}

// killer implements GroupKiller for both cgroups v1 and v2.
type killer struct {
	logger hclog.Logger // named "group_killer" sub-logger
	pid    int          // PID of the executor, which must survive the kill
}

// KillGroup will SIGKILL the process tree present in cgroup, using the freezer
// subsystem to prevent further forking, etc.
func (d *killer) KillGroup(cgroup *configs.Cgroup) error {
	// dispatch on the cgroups version detected on this host
	if !UseV2 {
		return d.v1(cgroup)
	}
	return d.v2(cgroup)
}

// v1 kills the process tree rooted in the task's cgroups.v1 freezer cgroup,
// then removes the cgroup from disk. The executor's own PID is first moved
// into the init freezer cgroup so it does not kill itself.
func (d *killer) v1(cgroup *configs.Cgroup) error {
	if cgroup == nil {
		return errors.New("missing cgroup")
	}

	// the actual path to our tasks freezer cgroup
	path := cgroup.Paths[freezer]

	d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v1", "executor_pid", d.pid)

	// move executor PID into the init freezer cgroup so we can kill the task
	// pids without killing the executor (which is the process running this code,
	// doing the killing)
	initPath, err := cgroups.GetInitCgroupPath(freezer)
	if err != nil {
		return fmt.Errorf("failed to find init cgroup: %w", err)
	}
	m := map[string]string{freezer: initPath}
	if err = cgroups.EnterPid(m, d.pid); err != nil {
		return fmt.Errorf("failed to add executor pid to init cgroup: %w", err)
	}

	// ability to freeze the cgroup
	freeze := func() {
		_ = new(fs.FreezerGroup).Set(path, frozen)
	}

	// ability to thaw the cgroup
	thaw := func() {
		_ = new(fs.FreezerGroup).Set(path, thawed)
	}

	// do the common kill logic
	if err = d.kill(path, freeze, thaw); err != nil {
		return err
	}

	// remove the cgroup from disk
	return cgroups.RemovePath(path)
}

// v2 kills the process tree in the task's cgroups.v2 cgroup via the unified
// hierarchy's freezer, then destroys the cgroup. The executor's own PID is
// first moved into the root init.scope so it does not kill itself.
func (d *killer) v2(cgroup *configs.Cgroup) error {
	if cgroup == nil {
		return errors.New("missing cgroup")
	}

	// absolute path of the task cgroup under the unified hierarchy root
	path := filepath.Join(CgroupRoot, cgroup.Path)

	existingPIDs, err := cgroups.GetPids(path)
	if err != nil {
		return fmt.Errorf("failed to determine pids in cgroup: %w", err)
	}

	d.logger.Trace("killing processes", "cgroup_path", path, "cgroup_version", "v2", "executor_pid", d.pid, "existing_pids", existingPIDs)

	mgr, err := fs2.NewManager(cgroup, "", rootless)
	if err != nil {
		return fmt.Errorf("failed to create v2 cgroup manager: %w", err)
	}

	// move executor PID into the root init.scope so we can kill the task pids
	// without killing the executor (which is the process running this code, doing
	// the killing)
	init, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, "init.scope"), rootless)
	if err != nil {
		return fmt.Errorf("failed to create v2 init cgroup manager: %w", err)
	}
	if err = init.Apply(d.pid); err != nil {
		return fmt.Errorf("failed to move executor pid into init.scope cgroup: %w", err)
	}

	d.logger.Trace("move of executor pid into init.scope complete", "pid", d.pid)

	// ability to freeze the cgroup
	freeze := func() {
		_ = mgr.Freeze(configs.Frozen)
	}

	// ability to thaw the cgroup
	thaw := func() {
		_ = mgr.Freeze(configs.Thawed)
	}

	// do the common kill logic

	if err = d.kill(path, freeze, thaw); err != nil {
		return err
	}

	// remove the cgroup from disk
	return mgr.Destroy()
}

// kill is used to SIGKILL all processes in cgroup
//
// The order of operations is
// 0. before calling this method, the executor pid has been moved outside of cgroup
// 1. freeze cgroup (so processes cannot fork further)
// 2. scan the cgroup to collect all pids
// 3. issue SIGKILL to each pid found
// 4. thaw the cgroup so processes can go die
// 5. wait on each processes until it is confirmed dead
func (d *killer) kill(cgroup string, freeze func(), thaw func()) error {
// freeze the cgroup stopping further forking
freeze()

d.logger.Trace("search for pids in", "cgroup", cgroup)

// find all the pids we intend to kill
pids, err := cgroups.GetPids(cgroup)
if err != nil {
// if we fail to get pids, re-thaw before bailing so there is at least
// a chance the processes can go die out of band
thaw()
return fmt.Errorf("failed to find pids: %w", err)
}

d.logger.Trace("send sigkill to frozen processes", "cgroup", cgroup, "pids", pids)

var processes []*os.Process

// kill the processes in cgroup
for _, pid := range pids {
Comment on lines +178 to +179
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we not expect that all the PIDs are going to be in the same process group? That is, could we kill via -pid here the same way we do for non-Linux use cases, or is this so we can avoid killing the executor even though we moved it into a different cgroup?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is honestly just copying the original implementation, but I believe this defends against sub-sub-sub processes that get daemonized. So like

Executor -> A -> B -> C

If B dies, C gets re-parented to (adopted by) PID 1, not A, and is no longer part of the process group ... right? 🤔

Copy link
Member

@tgross tgross Apr 5, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Children always get the parent's process group when forked, so the chain would be preserved even in that case (I don't think it gets changed after the fact on wait). But this implementation works, so let's keep it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is pretty in the weeds but I guess it does protect against callers of setsid or setpgid

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point. 👍

p, findErr := os.FindProcess(pid)
if findErr != nil {
d.logger.Trace("failed to find process of pid to kill", "pid", pid, "error", findErr)
continue
}
processes = append(processes, p)
if killErr := p.Kill(); killErr != nil {
d.logger.Trace("failed to kill process", "pid", pid, "error", killErr)
continue
}
}

// thawed the cgroup so we can wait on each process
thaw()

// wait on each process
for _, p := range processes {
// do not capture error; errors are normal here
pState, _ := p.Wait()
d.logger.Trace("return from wait on process", "pid", p.Pid, "state", pState)
}

// cgroups are not atomic, the OS takes a moment to un-mark the cgroup as in-use;
// a tiny sleep here goes a long way for not creating noisy (but functionally benign)
// errors about removing busy cgroup
//
// alternatively we could do the removal in a loop and silence the interim errors, but meh
time.Sleep(50 * time.Millisecond)

return nil
}
13 changes: 13 additions & 0 deletions client/lib/resources/containment.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package resources

// A Containment will cleanup resources created by an executor.
type Containment interface {
	// Apply enables containment on pid.
	Apply(pid int) error

	// Cleanup will purge executor resources like cgroups.
	Cleanup() error

	// GetPIDs will return the processes overseen by the Containment.
	GetPIDs() PIDs
}
Loading