From 5da1a31e945d69ddbe74d0cd2fed96c57e5d3c2f Mon Sep 17 00:00:00 2001
From: Seth Hoenig
Date: Mon, 28 Feb 2022 16:24:01 -0600
Subject: [PATCH] client: enable support for cgroups v2

This PR introduces support for using Nomad on systems with cgroups v2 [1]
enabled as the cgroups controller mounted on /sys/fs/cgroup. Newer Linux
distros like Ubuntu 21.10 are shipping with cgroups v2 only, causing problems
for Nomad users.

Nomad mostly "just works" with cgroups v2 due to the indirection via
libcontainer, but not so for managing cpuset cgroups. Previously, Nomad made
use of a feature in v1 where a PID could be a member of more than one cgroup.
In v2 this is no longer possible, so the logic around computing cpuset values
must be modified. When Nomad detects v2, it manages cpuset values in-process
rather than relying on cgroup hierarchy inheritance via shared/reserved
parents.

Nomad will only activate the v2 logic when it detects cgroup2 is mounted at
/sys/fs/cgroup. This means that on systems running in hybrid mode with cgroup2
mounted at /sys/fs/cgroup/unified (as is typical), Nomad will continue to use
the v1 logic and should operate as before. Systems that do not support
cgroups v2 are also not affected.

When v2 is activated, Nomad will create a parent called nomad.slice (unless
otherwise configured in Client config), and create cgroups for tasks using the
naming convention <allocID>-<task>.scope. These follow the naming convention
set by systemd and also used by Docker when cgroups v2 is detected.

Client nodes now export a new fingerprint attribute, unique.cgroup.version,
which will be set to 'v1' or 'v2' to indicate the cgroups regime in use by
Nomad.

The new cpuset management strategy fixes #11705, where Docker tasks that
spawned processes on startup would "leak". In cgroups v2, the PIDs are started
in the cgroup they will always live in, and thus the cause of the leak is
eliminated.
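As a quick orientation before the diff, here is a minimal sketch (not part of
the patch) of the decision the client now makes. It uses the same runc helper
the change relies on (cgroups.IsCgroup2UnifiedMode); the defaultParent helper
is hypothetical and only mirrors the defaults added below ("/nomad" for v1,
"nomad.slice" for v2):

```go
//go:build linux

// Illustrative sketch only: how the v1/v2 decision and default cgroup parent
// described above fit together.
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups"
)

// defaultParent mirrors the defaults described above: a nomad.slice parent on
// pure v2 systems, and the /nomad cpuset parent (with shared/reserved
// children) everywhere else.
func defaultParent(useV2 bool) string {
	if useV2 {
		return "nomad.slice"
	}
	return "/nomad"
}

func main() {
	// IsCgroup2UnifiedMode reports true only when cgroup2 is mounted directly
	// on /sys/fs/cgroup, so hybrid systems (cgroup2 on /sys/fs/cgroup/unified)
	// fall back to the v1 logic.
	useV2 := cgroups.IsCgroup2UnifiedMode()
	fmt.Printf("cgroups v2 unified: %v, cgroup parent: %s\n", useV2, defaultParent(useV2))
}
```

On hybrid systems the helper reports false, so the existing v1 layout
(shared/reserved cpuset children under /nomad) continues to be used.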
[1] https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html Closes #11289 Fixes #11705 #11773 #11933 --- .changelog/12274.txt | 3 + .github/workflows/test-core.yaml | 6 +- client/alloc_endpoint_test.go | 1 + client/allocrunner/testing.go | 2 +- client/client.go | 22 +- client/client_test.go | 3 +- client/config/config.go | 2 +- client/fingerprint/cgroup.go | 61 +++- client/fingerprint/cgroup_default.go | 1 - client/fingerprint/cgroup_linux.go | 22 +- client/fingerprint/cgroup_test.go | 138 +++++--- client/fingerprint/cpu_linux.go | 9 +- client/lib/cgutil/cgutil_default.go | 14 - client/lib/cgutil/cgutil_linux.go | 176 +++++----- client/lib/cgutil/cgutil_linux_test.go | 124 +++++++ client/lib/cgutil/cgutil_noop.go | 42 +++ client/lib/cgutil/cpuset_manager.go | 69 +++- client/lib/cgutil/cpuset_manager_default.go | 10 - client/lib/cgutil/cpuset_manager_test.go | 28 ++ ..._manager_linux.go => cpuset_manager_v1.go} | 184 ++++++++-- ...inux_test.go => cpuset_manager_v1_test.go} | 68 ++-- client/lib/cgutil/cpuset_manager_v2.go | 332 ++++++++++++++++++ client/lib/cgutil/cpuset_manager_v2_test.go | 90 +++++ client/testutil/driver_compatible.go | 59 ++-- client/testutil/driver_compatible_default.go | 22 ++ client/testutil/driver_compatible_linux.go | 56 +++ command/agent/config.go | 5 + drivers/docker/config.go | 4 +- drivers/docker/config_test.go | 2 +- drivers/docker/driver.go | 20 +- drivers/docker/driver_darwin.go | 2 + drivers/docker/driver_darwin_test.go | 2 + drivers/docker/driver_default.go | 1 - drivers/docker/driver_linux.go | 4 +- drivers/docker/driver_linux_test.go | 2 + drivers/docker/driver_test.go | 4 +- drivers/docker/driver_unix_test.go | 1 - drivers/docker/driver_windows.go | 2 + drivers/docker/driver_windows_test.go | 1 - drivers/docker/fingerprint.go | 7 +- drivers/docker/handle.go | 6 + drivers/docker/reconcile_cpuset.go | 125 +++++++ drivers/docker/reconcile_cpuset_noop.go | 19 + drivers/docker/reconcile_cpuset_test.go | 36 ++ .../{reconciler.go => reconcile_dangling.go} | 0 ...ler_test.go => reconcile_dangling_test.go} | 0 drivers/docker/util/stats_posix.go | 1 - drivers/docker/utils_unix_test.go | 1 - drivers/docker/utils_windows_test.go | 1 - drivers/exec/driver.go | 5 +- drivers/exec/driver_test.go | 283 ++++++++------- drivers/exec/driver_unix_test.go | 49 ++- drivers/java/driver_test.go | 63 ++-- drivers/rawexec/driver_test.go | 5 + drivers/rawexec/driver_unix_test.go | 52 +-- drivers/shared/executor/executor.go | 3 +- drivers/shared/executor/executor_linux.go | 67 +--- .../shared/executor/executor_linux_test.go | 105 +++--- drivers/shared/executor/executor_test.go | 9 + .../executor/executor_universal_linux.go | 4 +- .../executor/{client.go => grpc_client.go} | 0 .../executor/{server.go => grpc_server.go} | 0 .../executor/resource_container_linux.go | 1 + go.mod | 14 +- go.sum | 27 +- lib/cpuset/cpuset.go | 11 + lib/cpuset/cpuset_test.go | 15 +- plugins/drivers/driver.go | 2 +- plugins/drivers/testutils/exec_testing.go | 32 +- plugins/drivers/testutils/testing.go | 44 ++- .../content/docs/upgrade/upgrade-specific.mdx | 25 ++ 71 files changed, 1937 insertions(+), 669 deletions(-) create mode 100644 .changelog/12274.txt delete mode 100644 client/lib/cgutil/cgutil_default.go create mode 100644 client/lib/cgutil/cgutil_linux_test.go create mode 100644 client/lib/cgutil/cgutil_noop.go delete mode 100644 client/lib/cgutil/cpuset_manager_default.go create mode 100644 client/lib/cgutil/cpuset_manager_test.go rename client/lib/cgutil/{cpuset_manager_linux.go => 
cpuset_manager_v1.go} (59%) rename client/lib/cgutil/{cpuset_manager_linux_test.go => cpuset_manager_v1_test.go} (80%) create mode 100644 client/lib/cgutil/cpuset_manager_v2.go create mode 100644 client/lib/cgutil/cpuset_manager_v2_test.go create mode 100644 client/testutil/driver_compatible_default.go create mode 100644 client/testutil/driver_compatible_linux.go create mode 100644 drivers/docker/reconcile_cpuset.go create mode 100644 drivers/docker/reconcile_cpuset_noop.go create mode 100644 drivers/docker/reconcile_cpuset_test.go rename drivers/docker/{reconciler.go => reconcile_dangling.go} (100%) rename drivers/docker/{reconciler_test.go => reconcile_dangling_test.go} (100%) rename drivers/shared/executor/{client.go => grpc_client.go} (100%) rename drivers/shared/executor/{server.go => grpc_server.go} (100%) diff --git a/.changelog/12274.txt b/.changelog/12274.txt new file mode 100644 index 000000000000..3f47ae714a62 --- /dev/null +++ b/.changelog/12274.txt @@ -0,0 +1,3 @@ +```release-note:improvement +Enable support for cgroups v2 +``` diff --git a/.github/workflows/test-core.yaml b/.github/workflows/test-core.yaml index 5f2a66d3b841..672458ef3b9d 100644 --- a/.github/workflows/test-core.yaml +++ b/.github/workflows/test-core.yaml @@ -96,7 +96,7 @@ jobs: - client/devicemanager - client/dynamicplugins - client/fingerprint - # - client/lib/... + - client/lib/... - client/logmon - client/pluginmanager - client/state @@ -105,8 +105,8 @@ jobs: - client/taskenv - command - command/agent - # - drivers/docker - # - drivers/exec + - drivers/docker + - drivers/exec - drivers/java - drivers/rawexec - helper/... diff --git a/client/alloc_endpoint_test.go b/client/alloc_endpoint_test.go index 2e51a3702c46..272819ecc35c 100644 --- a/client/alloc_endpoint_test.go +++ b/client/alloc_endpoint_test.go @@ -992,6 +992,7 @@ func TestAlloc_ExecStreaming_ACL_WithIsolation_Image(t *testing.T) { // TestAlloc_ExecStreaming_ACL_WithIsolation_Chroot asserts that token only needs // alloc-exec acl policy when chroot isolation is used func TestAlloc_ExecStreaming_ACL_WithIsolation_Chroot(t *testing.T) { + ci.SkipSlow(t, "flaky on GHA; too much disk IO") ci.Parallel(t) if runtime.GOOS != "linux" || unix.Geteuid() != 0 { diff --git a/client/allocrunner/testing.go b/client/allocrunner/testing.go index da850719ef4e..41ee296ff838 100644 --- a/client/allocrunner/testing.go +++ b/client/allocrunner/testing.go @@ -70,7 +70,7 @@ func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, fu PrevAllocMigrator: allocwatcher.NoopPrevAlloc{}, DeviceManager: devicemanager.NoopMockManager(), DriverManager: drivermanager.TestDriverManager(t), - CpusetManager: cgutil.NoopCpusetManager(), + CpusetManager: new(cgutil.NoopCpusetManager), ServersContactedCh: make(chan struct{}), } return conf, cleanup diff --git a/client/client.go b/client/client.go index 826453170d8c..5b473a7fc773 100644 --- a/client/client.go +++ b/client/client.go @@ -365,7 +365,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie invalidAllocs: make(map[string]struct{}), serversContactedCh: make(chan struct{}), serversContactedOnce: sync.Once{}, - cpusetManager: cgutil.NewCpusetManager(cfg.CgroupParent, logger.Named("cpuset_manager")), + cpusetManager: cgutil.CreateCPUSetManager(cfg.CgroupParent, logger), EnterpriseClient: newEnterpriseClient(logger), } @@ -657,19 +657,23 @@ func (c *Client) init() error { // Ensure cgroups are created on linux platform if runtime.GOOS == "linux" && c.cpusetManager != nil { - err 
:= c.cpusetManager.Init() - if err != nil { - // if the client cannot initialize the cgroup then reserved cores will not be reported and the cpuset manager - // will be disabled. this is common when running in dev mode under a non-root user for example - c.logger.Warn("could not initialize cpuset cgroup subsystem, cpuset management disabled", "error", err) - c.cpusetManager = cgutil.NoopCpusetManager() + // use the client configuration for reservable_cores if set + cores := c.config.ReservableCores + if len(cores) == 0 { + // otherwise lookup the effective cores from the parent cgroup + cores, _ = cgutil.GetCPUsFromCgroup(c.config.CgroupParent) + } + if cpuErr := c.cpusetManager.Init(cores); cpuErr != nil { + // If the client cannot initialize the cgroup then reserved cores will not be reported and the cpuset manager + // will be disabled. this is common when running in dev mode under a non-root user for example. + c.logger.Warn("failed to initialize cpuset cgroup subsystem, cpuset management disabled", "error", cpuErr) + c.cpusetManager = new(cgutil.NoopCpusetManager) } } return nil } -// reloadTLSConnections allows a client to reload its TLS configuration on the -// fly +// reloadTLSConnections allows a client to reload its TLS configuration on the fly func (c *Client) reloadTLSConnections(newConfig *nconfig.TLSConfig) error { var tlsWrap tlsutil.RegionWrapper if newConfig != nil && newConfig.EnableRPC { diff --git a/client/client_test.go b/client/client_test.go index bb99e6cc9208..c8cdb0f0a282 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -17,6 +17,7 @@ import ( "github.com/hashicorp/nomad/client/config" consulApi "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/fingerprint" + cstate "github.com/hashicorp/nomad/client/state" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/pluginutils/catalog" "github.com/hashicorp/nomad/helper/pluginutils/singleton" @@ -30,8 +31,6 @@ import ( psstructs "github.com/hashicorp/nomad/plugins/shared/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" - - cstate "github.com/hashicorp/nomad/client/state" "github.com/stretchr/testify/require" ) diff --git a/client/config/config.go b/client/config/config.go index 9ec256eabb6d..4767cd12e1ec 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -774,7 +774,7 @@ func DefaultConfig() *Config { CNIConfigDir: "/opt/cni/config", CNIInterfacePrefix: "eth", HostNetworks: map[string]*structs.ClientHostNetworkConfig{}, - CgroupParent: cgutil.DefaultCgroupParent, + CgroupParent: cgutil.GetCgroupParent(""), MaxDynamicPort: structs.DefaultMinDynamicPort, MinDynamicPort: structs.DefaultMaxDynamicPort, } diff --git a/client/fingerprint/cgroup.go b/client/fingerprint/cgroup.go index b0097984c79f..b24c3f0a3442 100644 --- a/client/fingerprint/cgroup.go +++ b/client/fingerprint/cgroup.go @@ -3,55 +3,86 @@ package fingerprint import ( "time" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/lib/cgutil" - - log "github.com/hashicorp/go-hclog" ) const ( + cgroupAvailable = "available" cgroupUnavailable = "unavailable" - interval = 15 + + cgroupMountPointAttribute = "unique.cgroup.mountpoint" + cgroupVersionAttribute = "unique.cgroup.version" + + cgroupDetectInterval = 15 * time.Second ) type CGroupFingerprint struct { - logger log.Logger + logger hclog.Logger lastState string mountPointDetector MountPointDetector + versionDetector CgroupVersionDetector } -// An interface to isolate 
calls to the cgroup library -// This facilitates testing where we can implement -// fake mount points to test various code paths +// MountPointDetector isolates calls to the cgroup library. +// +// This facilitates testing where we can implement fake mount points to test +// various code paths. type MountPointDetector interface { + // MountPoint returns a cgroup mount-point. + // + // In v1, this is one arbitrary subsystem (e.g. /sys/fs/cgroup/cpu). + // + // In v2, this is the actual root mount point (i.e. /sys/fs/cgroup). MountPoint() (string, error) } -// Implements the interface detector which calls the cgroups library directly +// DefaultMountPointDetector implements the interface detector which calls the cgroups +// library directly type DefaultMountPointDetector struct { } // MountPoint calls out to the default cgroup library. -func (b *DefaultMountPointDetector) MountPoint() (string, error) { +func (*DefaultMountPointDetector) MountPoint() (string, error) { return cgutil.FindCgroupMountpointDir() } +// CgroupVersionDetector isolates calls to the cgroup library. +type CgroupVersionDetector interface { + // CgroupVersion returns v1 or v2 depending on the cgroups version in use. + CgroupVersion() string +} + +// DefaultCgroupVersionDetector implements the version detector which calls the +// cgroups library directly. +type DefaultCgroupVersionDetector struct { +} + +func (*DefaultCgroupVersionDetector) CgroupVersion() string { + if cgutil.UseV2 { + return "v2" + } + return "v1" +} + // NewCGroupFingerprint returns a new cgroup fingerprinter -func NewCGroupFingerprint(logger log.Logger) Fingerprint { - f := &CGroupFingerprint{ +func NewCGroupFingerprint(logger hclog.Logger) Fingerprint { + return &CGroupFingerprint{ logger: logger.Named("cgroup"), lastState: cgroupUnavailable, - mountPointDetector: &DefaultMountPointDetector{}, + mountPointDetector: new(DefaultMountPointDetector), + versionDetector: new(DefaultCgroupVersionDetector), } - return f } // clearCGroupAttributes clears any node attributes related to cgroups that might // have been set in a previous fingerprint run. func (f *CGroupFingerprint) clearCGroupAttributes(r *FingerprintResponse) { - r.RemoveAttribute("unique.cgroup.mountpoint") + r.RemoveAttribute(cgroupMountPointAttribute) + r.RemoveAttribute(cgroupVersionAttribute) } // Periodic determines the interval at which the periodic fingerprinter will run. func (f *CGroupFingerprint) Periodic() (bool, time.Duration) { - return true, interval * time.Second + return true, cgroupDetectInterval } diff --git a/client/fingerprint/cgroup_default.go b/client/fingerprint/cgroup_default.go index 272540834ea8..f1e53c603885 100644 --- a/client/fingerprint/cgroup_default.go +++ b/client/fingerprint/cgroup_default.go @@ -1,5 +1,4 @@ //go:build !linux -// +build !linux package fingerprint diff --git a/client/fingerprint/cgroup_linux.go b/client/fingerprint/cgroup_linux.go index 36cefdf0f310..a951dca6552a 100644 --- a/client/fingerprint/cgroup_linux.go +++ b/client/fingerprint/cgroup_linux.go @@ -1,5 +1,4 @@ //go:build linux -// +build linux package fingerprint @@ -7,31 +6,30 @@ import ( "fmt" ) -const ( - cgroupAvailable = "available" -) - -// Fingerprint tries to find a valid cgroup mount point +// Fingerprint tries to find a valid cgroup mount point and the version of cgroups +// if a mount-point is present. 
func (f *CGroupFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error { mount, err := f.mountPointDetector.MountPoint() if err != nil { f.clearCGroupAttributes(resp) - return fmt.Errorf("Failed to discover cgroup mount point: %s", err) + return fmt.Errorf("failed to discover cgroup mount point: %s", err) } - // Check if a cgroup mount point was found + // Check if a cgroup mount point was found. if mount == "" { - f.clearCGroupAttributes(resp) - if f.lastState == cgroupAvailable { - f.logger.Info("cgroups are unavailable") + f.logger.Warn("cgroups are now unavailable") } f.lastState = cgroupUnavailable return nil } - resp.AddAttribute("unique.cgroup.mountpoint", mount) + // Check the version in use. + version := f.versionDetector.CgroupVersion() + + resp.AddAttribute(cgroupMountPointAttribute, mount) + resp.AddAttribute(cgroupVersionAttribute, version) resp.Detected = true if f.lastState == cgroupUnavailable { diff --git a/client/fingerprint/cgroup_test.go b/client/fingerprint/cgroup_test.go index 11119b1d09e7..23205169ab27 100644 --- a/client/fingerprint/cgroup_test.go +++ b/client/fingerprint/cgroup_test.go @@ -1,5 +1,4 @@ //go:build linux -// +build linux package fingerprint @@ -11,6 +10,7 @@ import ( "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) // A fake mount point detector that returns an empty path @@ -41,94 +41,116 @@ func (m *MountPointDetectorEmptyMountPoint) MountPoint() (string, error) { return "", nil } -func TestCGroupFingerprint(t *testing.T) { +// A fake version detector that returns the set version. +type FakeVersionDetector struct { + version string +} + +func (f *FakeVersionDetector) CgroupVersion() string { + return f.version +} + +func newRequest(node *structs.Node) *FingerprintRequest { + return &FingerprintRequest{ + Config: new(config.Config), + Node: node, + } +} + +func newNode() *structs.Node { + return &structs.Node{ + Attributes: make(map[string]string), + } +} + +func TestCgroup_MountPoint(t *testing.T) { ci.Parallel(t) - { + t.Run("mount-point fail", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorMountPointFail{}, - } - - node := &structs.Node{ - Attributes: make(map[string]string), + mountPointDetector: new(MountPointDetectorMountPointFail), + versionDetector: new(DefaultCgroupVersionDetector), } - request := &FingerprintRequest{Config: &config.Config{}, Node: node} + request := newRequest(newNode()) var response FingerprintResponse err := f.Fingerprint(request, &response) - if err == nil { - t.Fatalf("expected an error") - } - - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a != "" { - t.Fatalf("unexpected attribute found, %s", a) - } - } + require.EqualError(t, err, "failed to discover cgroup mount point: cgroup mountpoint discovery failed") + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) - { + t.Run("mount-point available", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorValidMountPoint{}, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), - } - - request := &FingerprintRequest{Config: &config.Config{}, Node: node} + request := 
newRequest(newNode()) var response FingerprintResponse err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, ok := response.Attributes["unique.cgroup.mountpoint"]; !ok { - t.Fatalf("unable to find attribute: %s", a) - } - } + require.NoError(t, err) + require.Equal(t, "/sys/fs/cgroup", response.Attributes[cgroupMountPointAttribute]) + }) - { + t.Run("mount-point empty", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorEmptyMountPoint{}, + mountPointDetector: new(MountPointDetectorEmptyMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), - } - - request := &FingerprintRequest{Config: &config.Config{}, Node: node} var response FingerprintResponse - err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a != "" { - t.Fatalf("unexpected attribute found, %s", a) - } - } - { + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) + + t.Run("mount-point already present", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupAvailable, - mountPointDetector: &MountPointDetectorValidMountPoint{}, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), + var response FingerprintResponse + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Equal(t, "/sys/fs/cgroup", response.Attributes[cgroupMountPointAttribute]) + }) +} + +func TestCgroup_Version(t *testing.T) { + t.Run("version v1", func(t *testing.T) { + f := &CGroupFingerprint{ + logger: testlog.HCLogger(t), + lastState: cgroupUnavailable, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: &FakeVersionDetector{version: "v1"}, } - request := &FingerprintRequest{Config: &config.Config{}, Node: node} var response FingerprintResponse - err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a == "" { - t.Fatalf("expected attribute to be found, %s", a) + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Equal(t, "v1", response.Attributes[cgroupVersionAttribute]) + }) + + t.Run("without mount-point", func(t *testing.T) { + f := &CGroupFingerprint{ + logger: testlog.HCLogger(t), + lastState: cgroupUnavailable, + mountPointDetector: new(MountPointDetectorEmptyMountPoint), + versionDetector: &FakeVersionDetector{version: "v1"}, } - } + + var response FingerprintResponse + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) } diff --git a/client/fingerprint/cpu_linux.go b/client/fingerprint/cpu_linux.go index a31b8e60bf67..14c80faa3aa7 100644 --- a/client/fingerprint/cpu_linux.go +++ b/client/fingerprint/cpu_linux.go @@ -5,9 +5,8 @@ import ( ) func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) { - parent := req.Config.CgroupParent - if parent == "" { - parent = cgutil.DefaultCgroupParent - } - return 
cgutil.GetCPUsFromCgroup(parent) + // The cpuset cgroup manager is initialized (on linux), but not accessible + // from the finger-printer. So we reach in and grab the information manually. + // We may assume the hierarchy is already setup. + return cgutil.GetCPUsFromCgroup(req.Config.CgroupParent) } diff --git a/client/lib/cgutil/cgutil_default.go b/client/lib/cgutil/cgutil_default.go deleted file mode 100644 index c2fc74e0da28..000000000000 --- a/client/lib/cgutil/cgutil_default.go +++ /dev/null @@ -1,14 +0,0 @@ -//go:build !linux -// +build !linux - -package cgutil - -const ( - DefaultCgroupParent = "" -) - -// FindCgroupMountpointDir is used to find the cgroup mount point on a Linux -// system. Here it is a no-op implemtation -func FindCgroupMountpointDir() (string, error) { - return "", nil -} diff --git a/client/lib/cgutil/cgutil_linux.go b/client/lib/cgutil/cgutil_linux.go index e8b867aa1f19..349660ec3e6b 100644 --- a/client/lib/cgutil/cgutil_linux.go +++ b/client/lib/cgutil/cgutil_linux.go @@ -1,121 +1,106 @@ +//go:build linux + package cgutil import ( + "fmt" "os" "path/filepath" - cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" - + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper/uuid" "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" - "golang.org/x/sys/unix" -) - -const ( - DefaultCgroupParent = "/nomad" - SharedCpusetCgroupName = "shared" - ReservedCpusetCgroupName = "reserved" + lcc "github.com/opencontainers/runc/libcontainer/configs" ) -func GetCPUsFromCgroup(group string) ([]uint16, error) { - cgroupPath, err := getCgroupPathHelper("cpuset", group) - if err != nil { - return nil, err - } +// UseV2 indicates whether only cgroups.v2 is enabled. If cgroups.v2 is not +// enabled or is running in hybrid mode with cgroups.v1, Nomad will make use of +// cgroups.v1 +// +// This is a read-only value. +var UseV2 = cgroups.IsCgroup2UnifiedMode() + +// GetCgroupParent returns the mount point under the root cgroup in which Nomad +// will create cgroups. If parent is not set, an appropriate name for the version +// of cgroups will be used. +func GetCgroupParent(parent string) string { + if UseV2 { + return getParentV2(parent) + } + return getParentV1(parent) +} - man := cgroupFs.NewManager(&configs.Cgroup{Path: group}, map[string]string{"cpuset": cgroupPath}, false) - stats, err := man.GetStats() - if err != nil { - return nil, err +// CreateCPUSetManager creates a V1 or V2 CpusetManager depending on system configuration. +func CreateCPUSetManager(parent string, logger hclog.Logger) CpusetManager { + if UseV2 { + return NewCpusetManagerV2(getParentV2(parent), logger.Named("cpuset.v2")) } - return stats.CPUSetStats.CPUs, nil + return NewCpusetManagerV1(getParentV1(parent), logger.Named("cpuset.v1")) } -func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { - if cpus, err = fscommon.ReadFile(parent, "cpuset.cpus"); err != nil { - return - } - if mems, err = fscommon.ReadFile(parent, "cpuset.mems"); err != nil { - return +// GetCPUsFromCgroup gets the effective cpuset value for the given cgroup. 
+func GetCPUsFromCgroup(group string) ([]uint16, error) { + if UseV2 { + return getCPUsFromCgroupV2(getParentV2(group)) } - return cpus, mems, nil + return getCPUsFromCgroupV1(getParentV1(group)) } -// cpusetEnsureParent makes sure that the parent directories of current -// are created and populated with the proper cpus and mems files copied -// from their respective parent. It does that recursively, starting from -// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). -func cpusetEnsureParent(current string) error { - var st unix.Statfs_t +// CgroupScope returns the name of the scope for Nomad's managed cgroups for +// the given allocID and task. +// +// e.g. "-.scope" +// +// Only useful for v2. +func CgroupScope(allocID, task string) string { + return fmt.Sprintf("%s.%s.scope", allocID, task) +} - parent := filepath.Dir(current) - err := unix.Statfs(parent, &st) - if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { +// ConfigureBasicCgroups will initialize cgroups for v1. +// +// Not useful in cgroups.v2 +func ConfigureBasicCgroups(config *lcc.Config) error { + if UseV2 { + // In v2 the default behavior is to create inherited interface files for + // all mounted subsystems automatically. return nil } - // Treat non-existing directory as cgroupfs as it will be created, - // and the root cpuset directory obviously exists. - if err != nil && err != unix.ENOENT { - return &os.PathError{Op: "statfs", Path: parent, Err: err} - } - if err := cpusetEnsureParent(parent); err != nil { - return err - } - if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) { - return err - } - return cpusetCopyIfNeeded(current, parent) -} - -// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent -// directory to the current directory if the file's contents are 0 -func cpusetCopyIfNeeded(current, parent string) error { - currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + id := uuid.Generate() + // In v1 we must setup the freezer cgroup ourselves. + subsystem := "freezer" + path, err := GetCgroupPathHelperV1(subsystem, filepath.Join(DefaultCgroupV1Parent, id)) if err != nil { - return err + return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err) } - parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) - if err != nil { + if err = os.MkdirAll(path, 0755); err != nil { return err } - - if isEmptyCpuset(currentCpus) { - if err := fscommon.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { - return err - } - } - if isEmptyCpuset(currentMems) { - if err := fscommon.WriteFile(current, "cpuset.mems", parentMems); err != nil { - return err - } + config.Cgroups.Paths = map[string]string{ + subsystem: path, } return nil } -func isEmptyCpuset(str string) bool { - return str == "" || str == "\n" -} - -func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) - if err != nil { - return "", err - } - - // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. - relCgroup, err := filepath.Rel(root, cgroup) - if err != nil { - return "", err - } - - return filepath.Join(mnt, relCgroup), nil -} - // FindCgroupMountpointDir is used to find the cgroup mount point on a Linux // system. +// +// Note that in cgroups.v1, this returns one of many subsystems that are mounted. +// e.g. 
a return value of "/sys/fs/cgroup/systemd" really implies the root is +// "/sys/fs/cgroup", which is interesting on hybrid systems where the 'unified' +// subsystem is mounted as if it were a subsystem, but the actual root is different. +// (i.e. /sys/fs/cgroup/unified). +// +// As far as Nomad is concerned, UseV2 is the source of truth for which hierarchy +// to use, and that will only be a true value if cgroups.v2 is mounted on +// /sys/fs/cgroup (i.e. system is not in v1 or hybrid mode). +// +// ➜ mount -l | grep cgroup +// tmpfs on /sys/fs/cgroup type tmpfs (ro,nosuid,nodev,noexec,mode=755,inode64) +// cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate) +// cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) +// cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) +// (etc.) func FindCgroupMountpointDir() (string, error) { mount, err := cgroups.GetCgroupMounts(false) if err != nil { @@ -127,3 +112,18 @@ func FindCgroupMountpointDir() (string, error) { } return mount[0].Mountpoint, nil } + +// CopyCpuset copies the cpuset.cpus value from source into destination. +func CopyCpuset(source, destination string) error { + correct, err := cgroups.ReadFile(source, "cpuset.cpus") + if err != nil { + return err + } + + err = cgroups.WriteFile(destination, "cpuset.cpus", correct) + if err != nil { + return err + } + + return nil +} diff --git a/client/lib/cgutil/cgutil_linux_test.go b/client/lib/cgutil/cgutil_linux_test.go new file mode 100644 index 000000000000..bbc18ef8c3e9 --- /dev/null +++ b/client/lib/cgutil/cgutil_linux_test.go @@ -0,0 +1,124 @@ +//go:build linux + +package cgutil + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/stretchr/testify/require" +) + +func TestUtil_GetCgroupParent(t *testing.T) { + ci.Parallel(t) + + t.Run("v1", func(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + t.Run("default", func(t *testing.T) { + exp := "/nomad" + parent := GetCgroupParent("") + require.Equal(t, exp, parent) + }) + + t.Run("configured", func(t *testing.T) { + exp := "/bar" + parent := GetCgroupParent("/bar") + require.Equal(t, exp, parent) + }) + }) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + t.Run("default", func(t *testing.T) { + exp := "nomad.slice" + parent := GetCgroupParent("") + require.Equal(t, exp, parent) + }) + + t.Run("configured", func(t *testing.T) { + exp := "abc.slice" + parent := GetCgroupParent("abc.slice") + require.Equal(t, exp, parent) + }) + }) +} + +func TestUtil_CreateCPUSetManager(t *testing.T) { + ci.Parallel(t) + + logger := testlog.HCLogger(t) + + t.Run("v1", func(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + parent := "/" + uuid.Short() + manager := CreateCPUSetManager(parent, logger) + err := manager.Init([]uint16{0}) + require.NoError(t, err) + require.NoError(t, cgroups.RemovePath(filepath.Join(CgroupRoot, parent))) + }) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + parent := uuid.Short() + ".slice" + manager := CreateCPUSetManager(parent, logger) + err := manager.Init([]uint16{0}) + require.NoError(t, err) + require.NoError(t, cgroups.RemovePath(filepath.Join(CgroupRoot, 
parent))) + }) +} + +func TestUtil_GetCPUsFromCgroup(t *testing.T) { + ci.Parallel(t) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + cpus, err := GetCPUsFromCgroup("system.slice") // thanks, systemd! + require.NoError(t, err) + require.NotEmpty(t, cpus) + }) +} + +func create(t *testing.T, name string) { + mgr, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, name), rootless) + require.NoError(t, err) + err = mgr.Apply(CreationPID) + require.NoError(t, err) +} + +func cleanup(t *testing.T, name string) { + err := cgroups.RemovePath(name) + require.NoError(t, err) +} + +func TestUtil_CopyCpuset(t *testing.T) { + ci.Parallel(t) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + source := uuid.Short() + ".scope" + create(t, source) + defer cleanup(t, source) + require.NoError(t, cgroups.WriteFile(filepath.Join(CgroupRoot, source), "cpuset.cpus", "0-1")) + + destination := uuid.Short() + ".scope" + create(t, destination) + defer cleanup(t, destination) + + err := CopyCpuset( + filepath.Join(CgroupRoot, source), + filepath.Join(CgroupRoot, destination), + ) + require.NoError(t, err) + + value, readErr := cgroups.ReadFile(filepath.Join(CgroupRoot, destination), "cpuset.cpus") + require.NoError(t, readErr) + require.Equal(t, "0-1", strings.TrimSpace(value)) + }) +} diff --git a/client/lib/cgutil/cgutil_noop.go b/client/lib/cgutil/cgutil_noop.go new file mode 100644 index 000000000000..91a35b14492e --- /dev/null +++ b/client/lib/cgutil/cgutil_noop.go @@ -0,0 +1,42 @@ +//go:build !linux + +package cgutil + +import ( + "github.com/hashicorp/go-hclog" +) + +const ( + // DefaultCgroupParent does not apply to non-Linux operating systems. + DefaultCgroupParent = "" +) + +// UseV2 is always false on non-Linux systems. +// +// This is a read-only value. +var UseV2 = false + +// CreateCPUSetManager creates a no-op CpusetManager for non-Linux operating systems. +func CreateCPUSetManager(string, hclog.Logger) CpusetManager { + return new(NoopCpusetManager) +} + +// FindCgroupMountpointDir returns nothing for non-Linux operating systems. +func FindCgroupMountpointDir() (string, error) { + return "", nil +} + +// GetCgroupParent returns nothing for non-Linux operating systems. +func GetCgroupParent(string) string { + return DefaultCgroupParent +} + +// GetCPUsFromCgroup returns nothing for non-Linux operating systems. +func GetCPUsFromCgroup(string) ([]uint16, error) { + return nil, nil +} + +// CgroupScope returns nothing for non-Linux operating systems. +func CgroupScope(allocID, task string) string { + return "" +} diff --git a/client/lib/cgutil/cpuset_manager.go b/client/lib/cgutil/cpuset_manager.go index 809af90fcbf1..7d16c752f84c 100644 --- a/client/lib/cgutil/cpuset_manager.go +++ b/client/lib/cgutil/cpuset_manager.go @@ -2,18 +2,26 @@ package cgutil import ( "context" + "fmt" + "path/filepath" + "strings" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/structs" ) -// CpusetManager is used to setup cpuset cgroups for each task. A pool of shared cpus is managed for -// tasks which don't require any reserved cores and a cgroup is managed seperetly for each task which -// require reserved cores. +const ( + // CgroupRoot is hard-coded in the cgroups specification. + // It only applies to linux but helpers have references to it in driver(s). + CgroupRoot = "/sys/fs/cgroup" +) + +// CpusetManager is used to setup cpuset cgroups for each task. 
type CpusetManager interface { - // Init should be called before any tasks are managed to ensure the cgroup parent exists and - // check that proper permissions are granted to manage cgroups. - Init() error + // Init should be called with the initial set of reservable cores before any + // allocations are managed. Ensures the parent cgroup exists and proper permissions + // are available for managing cgroups. + Init([]uint16) error // AddAlloc adds an allocation to the manager AddAlloc(alloc *structs.Allocation) @@ -26,8 +34,26 @@ type CpusetManager interface { CgroupPathFor(allocID, taskName string) CgroupPathGetter } -// CgroupPathGetter is a function which returns the cgroup path and any error which ocured during cgroup initialization. -// It should block until the cgroup has been created or an error is reported +type NoopCpusetManager struct{} + +func (n NoopCpusetManager) Init([]uint16) error { + return nil +} + +func (n NoopCpusetManager) AddAlloc(alloc *structs.Allocation) { +} + +func (n NoopCpusetManager) RemoveAlloc(allocID string) { +} + +func (n NoopCpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { + return func(context.Context) (string, error) { return "", nil } +} + +// CgroupPathGetter is a function which returns the cgroup path and any error which +// occurred during cgroup initialization. +// +// It should block until the cgroup has been created or an error is reported. type CgroupPathGetter func(context.Context) (path string, err error) type TaskCgroupInfo struct { @@ -37,20 +63,25 @@ type TaskCgroupInfo struct { Error error } -func NoopCpusetManager() CpusetManager { return noopCpusetManager{} } - -type noopCpusetManager struct{} +// identity is the "." string that uniquely identifies an +// individual instance of a task within the flat cgroup namespace +type identity string -func (n noopCpusetManager) Init() error { - return nil -} - -func (n noopCpusetManager) AddAlloc(alloc *structs.Allocation) { +func makeID(allocID, task string) identity { + return identity(fmt.Sprintf("%s.%s", allocID, task)) } -func (n noopCpusetManager) RemoveAlloc(allocID string) { +func makeScope(id identity) string { + return string(id) + ".scope" } -func (n noopCpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { - return func(context.Context) (string, error) { return "", nil } +// SplitPath determines the parent and cgroup from p. +// p must contain at least 2 elements (parent + cgroup). +// +// Handles the cgroup root if present. +func SplitPath(p string) (string, string) { + p = strings.TrimPrefix(p, CgroupRoot) + p = strings.Trim(p, "/") + parts := strings.Split(p, "/") + return parts[0], "/" + filepath.Join(parts[1:]...) 
} diff --git a/client/lib/cgutil/cpuset_manager_default.go b/client/lib/cgutil/cpuset_manager_default.go deleted file mode 100644 index 1f8c077baa6d..000000000000 --- a/client/lib/cgutil/cpuset_manager_default.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build !linux -// +build !linux - -package cgutil - -import ( - "github.com/hashicorp/go-hclog" -) - -func NewCpusetManager(_ string, _ hclog.Logger) CpusetManager { return noopCpusetManager{} } diff --git a/client/lib/cgutil/cpuset_manager_test.go b/client/lib/cgutil/cpuset_manager_test.go new file mode 100644 index 000000000000..88f661411d26 --- /dev/null +++ b/client/lib/cgutil/cpuset_manager_test.go @@ -0,0 +1,28 @@ +package cgutil + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/stretchr/testify/require" +) + +func TestUtil_SplitPath(t *testing.T) { + ci.Parallel(t) + + try := func(input, expParent, expCgroup string) { + parent, cgroup := SplitPath(input) + require.Equal(t, expParent, parent) + require.Equal(t, expCgroup, cgroup) + } + + // foo, /bar + try("foo/bar", "foo", "/bar") + try("/foo/bar/", "foo", "/bar") + try("/sys/fs/cgroup/foo/bar", "foo", "/bar") + + // foo, /bar/baz + try("/foo/bar/baz/", "foo", "/bar/baz") + try("foo/bar/baz", "foo", "/bar/baz") + try("/sys/fs/cgroup/foo/bar/baz", "foo", "/bar/baz") +} diff --git a/client/lib/cgutil/cpuset_manager_linux.go b/client/lib/cgutil/cpuset_manager_v1.go similarity index 59% rename from client/lib/cgutil/cpuset_manager_linux.go rename to client/lib/cgutil/cpuset_manager_v1.go index ba583c043c22..0e910705290e 100644 --- a/client/lib/cgutil/cpuset_manager_linux.go +++ b/client/lib/cgutil/cpuset_manager_v1.go @@ -1,3 +1,5 @@ +//go:build linux + package cgutil import ( @@ -10,21 +12,28 @@ import ( "sync" "time" - "github.com/opencontainers/runc/libcontainer/cgroups" - cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/structs" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +const ( + DefaultCgroupV1Parent = "/nomad" + SharedCpusetCgroupName = "shared" + ReservedCpusetCgroupName = "reserved" ) -func NewCpusetManager(cgroupParent string, logger hclog.Logger) CpusetManager { +// NewCpusetManagerV1 creates a CpusetManager compatible with cgroups.v1 +func NewCpusetManagerV1(cgroupParent string, logger hclog.Logger) CpusetManager { if cgroupParent == "" { - cgroupParent = DefaultCgroupParent + cgroupParent = DefaultCgroupV1Parent } - return &cpusetManager{ + return &cpusetManagerV1{ cgroupParent: cgroupParent, cgroupInfo: map[string]allocTaskCgroupInfo{}, logger: logger, @@ -35,7 +44,7 @@ var ( cpusetReconcileInterval = 30 * time.Second ) -type cpusetManager struct { +type cpusetManagerV1 struct { // cgroupParent relative to the cgroup root. ex. '/nomad' cgroupParent string // cgroupParentPath is the absolute path to the cgroup parent. 
@@ -53,7 +62,7 @@ type cpusetManager struct { logger hclog.Logger } -func (c *cpusetManager) AddAlloc(alloc *structs.Allocation) { +func (c *cpusetManagerV1) AddAlloc(alloc *structs.Allocation) { if alloc == nil || alloc.AllocatedResources == nil { return } @@ -77,14 +86,14 @@ func (c *cpusetManager) AddAlloc(alloc *structs.Allocation) { go c.signalReconcile() } -func (c *cpusetManager) RemoveAlloc(allocID string) { +func (c *cpusetManagerV1) RemoveAlloc(allocID string) { c.mu.Lock() delete(c.cgroupInfo, allocID) c.mu.Unlock() go c.signalReconcile() } -func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { +func (c *cpusetManagerV1) CgroupPathFor(allocID, task string) CgroupPathGetter { return func(ctx context.Context) (string, error) { c.mu.Lock() allocInfo, ok := c.cgroupInfo[allocID] @@ -99,15 +108,21 @@ func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { return "", fmt.Errorf("task %q not found", task) } + timer, stop := helper.NewSafeTimer(0) + defer stop() + for { + if taskInfo.Error != nil { break } + + timer.Reset(100 * time.Millisecond) if _, err := os.Stat(taskInfo.CgroupPath); os.IsNotExist(err) { select { case <-ctx.Done(): return taskInfo.CgroupPath, ctx.Err() - case <-time.After(100 * time.Millisecond): + case <-timer.C: continue } } @@ -119,24 +134,25 @@ func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { } +// task name -> task cgroup info type allocTaskCgroupInfo map[string]*TaskCgroupInfo // Init checks that the cgroup parent and expected child cgroups have been created // If the cgroup parent is set to /nomad then this will ensure that the /nomad/shared // cgroup is initialized. -func (c *cpusetManager) Init() error { - cgroupParentPath, err := getCgroupPathHelper("cpuset", c.cgroupParent) +func (c *cpusetManagerV1) Init(_ []uint16) error { + cgroupParentPath, err := GetCgroupPathHelperV1("cpuset", c.cgroupParent) if err != nil { return err } c.cgroupParentPath = cgroupParentPath // ensures that shared cpuset exists and that the cpuset values are copied from the parent if created - if err := cpusetEnsureParent(filepath.Join(cgroupParentPath, SharedCpusetCgroupName)); err != nil { + if err := cpusetEnsureParentV1(filepath.Join(cgroupParentPath, SharedCpusetCgroupName)); err != nil { return err } - parentCpus, parentMems, err := getCpusetSubsystemSettings(cgroupParentPath) + parentCpus, parentMems, err := getCpusetSubsystemSettingsV1(cgroupParentPath) if err != nil { return fmt.Errorf("failed to detect parent cpuset settings: %v", err) } @@ -155,7 +171,7 @@ func (c *cpusetManager) Init() error { return err } - if err := fscommon.WriteFile(filepath.Join(cgroupParentPath, ReservedCpusetCgroupName), "cpuset.mems", parentMems); err != nil { + if err := cgroups.WriteFile(filepath.Join(cgroupParentPath, ReservedCpusetCgroupName), "cpuset.mems", parentMems); err != nil { return err } @@ -168,7 +184,7 @@ func (c *cpusetManager) Init() error { return nil } -func (c *cpusetManager) reconcileLoop() { +func (c *cpusetManagerV1) reconcileLoop() { timer := time.NewTimer(0) if !timer.Stop() { <-timer.C @@ -189,7 +205,7 @@ func (c *cpusetManager) reconcileLoop() { } } -func (c *cpusetManager) reconcileCpusets() { +func (c *cpusetManagerV1) reconcileCpusets() { c.mu.Lock() defer c.mu.Unlock() sharedCpuset := cpuset.New(c.parentCpuset.ToSlice()...) 
@@ -240,13 +256,13 @@ func (c *cpusetManager) reconcileCpusets() { } // copy cpuset.mems from parent - _, parentMems, err := getCpusetSubsystemSettings(filepath.Dir(info.CgroupPath)) + _, parentMems, err := getCpusetSubsystemSettingsV1(filepath.Dir(info.CgroupPath)) if err != nil { c.logger.Error("failed to read parent cgroup settings for task", "path", info.CgroupPath, "error", err) info.Error = err continue } - if err := fscommon.WriteFile(info.CgroupPath, "cpuset.mems", parentMems); err != nil { + if err := cgroups.WriteFile(info.CgroupPath, "cpuset.mems", parentMems); err != nil { c.logger.Error("failed to write cgroup cpuset.mems setting for task", "path", info.CgroupPath, "mems", parentMems, "error", err) info.Error = err continue @@ -260,30 +276,30 @@ func (c *cpusetManager) reconcileCpusets() { } // setCgroupCpusetCPUs will compare an existing cpuset.cpus value with an expected value, overwriting the existing if different -// must hold a lock on cpusetManager.mu before calling -func (_ *cpusetManager) setCgroupCpusetCPUs(path, cpus string) error { - currentCpusRaw, err := fscommon.ReadFile(path, "cpuset.cpus") +// must hold a lock on cpusetManagerV1.mu before calling +func (_ *cpusetManagerV1) setCgroupCpusetCPUs(path, cpus string) error { + currentCpusRaw, err := cgroups.ReadFile(path, "cpuset.cpus") if err != nil { return err } if cpus != strings.TrimSpace(currentCpusRaw) { - if err := fscommon.WriteFile(path, "cpuset.cpus", cpus); err != nil { + if err := cgroups.WriteFile(path, "cpuset.cpus", cpus); err != nil { return err } } return nil } -func (c *cpusetManager) signalReconcile() { +func (c *cpusetManagerV1) signalReconcile() { select { case c.signalCh <- struct{}{}: case <-c.doneCh: } } -func (c *cpusetManager) getCpuset(group string) (cpuset.CPUSet, error) { - man := cgroupFs.NewManager( +func (c *cpusetManagerV1) getCpuset(group string) (cpuset.CPUSet, error) { + man := fs.NewManager( &configs.Cgroup{ Path: filepath.Join(c.cgroupParent, group), }, @@ -297,15 +313,119 @@ func (c *cpusetManager) getCpuset(group string) (cpuset.CPUSet, error) { return cpuset.New(stats.CPUSetStats.CPUs...), nil } -func (c *cpusetManager) getCgroupPathsForTask(allocID, task string) (absolute, relative string) { +func (c *cpusetManagerV1) getCgroupPathsForTask(allocID, task string) (absolute, relative string) { return filepath.Join(c.reservedCpusetPath(), fmt.Sprintf("%s-%s", allocID, task)), filepath.Join(c.cgroupParent, ReservedCpusetCgroupName, fmt.Sprintf("%s-%s", allocID, task)) } -func (c *cpusetManager) sharedCpusetPath() string { +func (c *cpusetManagerV1) sharedCpusetPath() string { return filepath.Join(c.cgroupParentPath, SharedCpusetCgroupName) } -func (c *cpusetManager) reservedCpusetPath() string { +func (c *cpusetManagerV1) reservedCpusetPath() string { return filepath.Join(c.cgroupParentPath, ReservedCpusetCgroupName) } + +func getCPUsFromCgroupV1(group string) ([]uint16, error) { + cgroupPath, err := GetCgroupPathHelperV1("cpuset", group) + if err != nil { + return nil, err + } + + man := fs.NewManager(&configs.Cgroup{Path: group}, map[string]string{"cpuset": cgroupPath}, false) + stats, err := man.GetStats() + if err != nil { + return nil, err + } + return stats.CPUSetStats.CPUs, nil +} + +func getParentV1(parent string) string { + if parent == "" { + return DefaultCgroupV1Parent + } + return parent +} + +// cpusetEnsureParentV1 makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective 
parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). +func cpusetEnsureParentV1(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParentV1(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeededV1(current, parent) +} + +// cpusetCopyIfNeededV1 copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeededV1(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettingsV1(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettingsV1(parent) + if err != nil { + return err + } + + if isEmptyCpusetV1(currentCpus) { + if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { + return err + } + } + if isEmptyCpusetV1(currentMems) { + if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil { + return err + } + } + return nil +} + +func getCpusetSubsystemSettingsV1(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil { + return + } + if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil { + return + } + return cpus, mems, nil +} + +func isEmptyCpusetV1(str string) bool { + return str == "" || str == "\n" +} + +func GetCgroupPathHelperV1(subsystem, cgroup string) (string, error) { + mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. 
+ relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + result := filepath.Join(mnt, relCgroup) + return result, nil +} diff --git a/client/lib/cgutil/cpuset_manager_linux_test.go b/client/lib/cgutil/cpuset_manager_v1_test.go similarity index 80% rename from client/lib/cgutil/cpuset_manager_linux_test.go rename to client/lib/cgutil/cpuset_manager_v1_test.go index e2bb00679c36..3ada7b551a44 100644 --- a/client/lib/cgutil/cpuset_manager_linux_test.go +++ b/client/lib/cgutil/cpuset_manager_v1_test.go @@ -1,51 +1,48 @@ +//go:build linux + package cgutil import ( "io/ioutil" "path/filepath" - "runtime" - "syscall" "testing" + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/mock" "github.com/opencontainers/runc/libcontainer/cgroups" - - "github.com/hashicorp/nomad/helper/uuid" - "github.com/stretchr/testify/require" - - "github.com/hashicorp/nomad/helper/testlog" ) -func tmpCpusetManager(t *testing.T) (manager *cpusetManager, cleanup func()) { - if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { - t.Skip("Test only available running as root on linux") - } +func tmpCpusetManagerV1(t *testing.T) (manager *cpusetManagerV1, cleanup func()) { mount, err := FindCgroupMountpointDir() if err != nil || mount == "" { t.Skipf("Failed to find cgroup mount: %v %v", mount, err) } parent := "/gotest-" + uuid.Short() - require.NoError(t, cpusetEnsureParent(parent)) + require.NoError(t, cpusetEnsureParentV1(parent)) - manager = &cpusetManager{ + manager = &cpusetManagerV1{ cgroupParent: parent, cgroupInfo: map[string]allocTaskCgroupInfo{}, logger: testlog.HCLogger(t), } - parentPath, err := getCgroupPathHelper("cpuset", parent) + parentPath, err := GetCgroupPathHelperV1("cpuset", parent) require.NoError(t, err) return manager, func() { require.NoError(t, cgroups.RemovePaths(map[string]string{"cpuset": parentPath})) } } -func TestCpusetManager_Init(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) +func TestCpusetManager_V1_Init(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) + require.NoError(t, manager.Init(nil)) require.DirExists(t, filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName)) require.FileExists(t, filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName, "cpuset.cpus")) @@ -57,10 +54,12 @@ func TestCpusetManager_Init(t *testing.T) { require.DirExists(t, filepath.Join(manager.cgroupParentPath, ReservedCpusetCgroupName)) } -func TestCpusetManager_AddAlloc_single(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) +func TestCpusetManager_V1_AddAlloc_single(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) + require.NoError(t, manager.Init(nil)) alloc := mock.Alloc() // reserve just one core (the 0th core, which probably exists) @@ -104,26 +103,19 @@ func TestCpusetManager_AddAlloc_single(t *testing.T) { require.Exactly(t, alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores, taskCpus.ToSlice()) } -func TestCpusetManager_AddAlloc_subset(t *testing.T) { - t.Skip("todo: add test for #11933") -} +func TestCpusetManager_V1_RemoveAlloc(t *testing.T) { + testutil.CgroupsCompatibleV1(t) -func TestCpusetManager_AddAlloc_all(t *testing.T) { - // cgroupsv2 changes behavior of writing empty 
cpuset.cpu, which is what - // happens to the /shared group when one or more allocs consume all available - // cores. - t.Skip("todo: add test for #11933") -} - -func TestCpusetManager_RemoveAlloc(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) - - // this case tests adding 2 allocs, reconciling then removing 1 alloc - // it requires the system to have atleast 2 cpu cores (one for each alloc) - if manager.parentCpuset.Size() < 2 { - t.Skip("test requires atleast 2 cpu cores") + require.NoError(t, manager.Init(nil)) + + // This case tests adding 2 allocs, reconciling then removing 1 alloc + // it requires the system to have at least 3 cpu cores (one for each alloc), + // BUT plus another one because writing an empty cpuset causes the cgroup to + // inherit the parent. + if manager.parentCpuset.Size() < 3 { + t.Skip("test requires at least 3 cpu cores") } alloc1 := mock.Alloc() diff --git a/client/lib/cgutil/cpuset_manager_v2.go b/client/lib/cgutil/cpuset_manager_v2.go new file mode 100644 index 000000000000..00162ca523a5 --- /dev/null +++ b/client/lib/cgutil/cpuset_manager_v2.go @@ -0,0 +1,332 @@ +//go:build linux + +package cgutil + +import ( + "context" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/lib/cpuset" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + // CreationPID is a special PID in libcontainer used to denote a cgroup + // should be created, but with no process added. + // + // https://github.com/opencontainers/runc/blob/v1.0.3/libcontainer/cgroups/utils.go#L372 + CreationPID = -1 + + // DefaultCgroupParentV2 is the name of Nomad's default parent cgroup, under which + // all other cgroups are managed. This can be changed with client configuration + // in case for e.g. Nomad tasks should be further constrained by an externally + // configured systemd cgroup. + DefaultCgroupParentV2 = "nomad.slice" + + // rootless is (for now) always false; Nomad clients require root, so we + // assume to not need to do the extra plumbing for rootless cgroups. + rootless = false +) + +// nothing is used for treating a map like a set with no values +type nothing struct{} + +// null represents nothing +var null = nothing{} + +type cpusetManagerV2 struct { + logger hclog.Logger + + parent string // relative to cgroup root (e.g. "nomad.slice") + parentAbs string // absolute path (e.g. 
"/sys/fs/cgroup/nomad.slice") + initial cpuset.CPUSet // set of initial cores (never changes) + + lock sync.Mutex // hold this when managing pool / sharing / isolating + pool cpuset.CPUSet // pool of cores being shared among all tasks + sharing map[identity]nothing // sharing tasks using cores only from the pool + isolating map[identity]cpuset.CPUSet // isolating tasks using cores from the pool + reserved cores +} + +func NewCpusetManagerV2(parent string, logger hclog.Logger) CpusetManager { + cgroupParent := getParentV2(parent) + return &cpusetManagerV2{ + parent: cgroupParent, + parentAbs: filepath.Join(CgroupRoot, cgroupParent), + logger: logger, + sharing: make(map[identity]nothing), + isolating: make(map[identity]cpuset.CPUSet), + } +} + +func (c *cpusetManagerV2) Init(cores []uint16) error { + c.logger.Debug("initializing with", "cores", cores) + if err := c.ensureParent(); err != nil { + c.logger.Error("failed to init cpuset manager", "err", err) + return err + } + c.initial = cpuset.New(cores...) + return nil +} + +func (c *cpusetManagerV2) AddAlloc(alloc *structs.Allocation) { + if alloc == nil || alloc.AllocatedResources == nil { + return + } + c.logger.Trace("add allocation", "name", alloc.Name, "id", alloc.ID) + + // grab write lock while we recompute and apply changes + c.lock.Lock() + defer c.lock.Unlock() + + // first update our tracking of isolating and sharing tasks + for task, resources := range alloc.AllocatedResources.Tasks { + id := makeID(alloc.ID, task) + if len(resources.Cpu.ReservedCores) > 0 { + c.isolating[id] = cpuset.New(resources.Cpu.ReservedCores...) + } else { + c.sharing[id] = null + } + } + + // recompute the available sharable cpu cores + c.recalculate() + + // now write out the entire cgroups space + c.reconcile() + + // no need to cleanup on adds, we did not remove a task +} + +func (c *cpusetManagerV2) RemoveAlloc(allocID string) { + c.logger.Trace("remove allocation", "id", allocID) + + // grab write lock while we recompute and apply changes. + c.lock.Lock() + defer c.lock.Unlock() + + // remove tasks of allocID from the sharing set + for id := range c.sharing { + if strings.HasPrefix(string(id), allocID) { + delete(c.sharing, id) + } + } + + // remove tasks of allocID from the isolating set + for id := range c.isolating { + if strings.HasPrefix(string(id), allocID) { + delete(c.isolating, id) + } + } + + // recompute available sharable cpu cores + c.recalculate() + + // now write out the entire cgroups space + c.reconcile() + + // now remove any tasks no longer running + c.cleanup() +} + +func (c *cpusetManagerV2) CgroupPathFor(allocID, task string) CgroupPathGetter { + // The CgroupPathFor implementation must block until cgroup for allocID.task + // exists [and can accept a PID]. 
+ return func(ctx context.Context) (string, error) { + ticks, cancel := helper.NewSafeTimer(100 * time.Millisecond) + defer cancel() + + for { + path := c.pathOf(makeID(allocID, task)) + mgr, err := fs2.NewManager(nil, path, rootless) + if err != nil { + return "", err + } + + if mgr.Exists() { + return path, nil + } + + select { + case <-ctx.Done(): + return "", ctx.Err() + case <-ticks.C: + continue + } + } + } +} + +// recalculate the number of cores sharable by non-isolating tasks (and isolating tasks) +// +// must be called while holding c.lock +func (c *cpusetManagerV2) recalculate() { + remaining := c.initial.Copy() + for _, set := range c.isolating { + remaining = remaining.Difference(set) + } + c.pool = remaining +} + +// reconcile will actually write the cpuset values for all tracked tasks. +// +// must be called while holding c.lock +func (c *cpusetManagerV2) reconcile() { + for id := range c.sharing { + c.write(id, c.pool) + } + + for id, set := range c.isolating { + c.write(id, c.pool.Union(set)) + } +} + +// cleanup will remove any cgroups for allocations no longer being tracked +// +// must be called while holding c.lock +func (c *cpusetManagerV2) cleanup() { + // create a map to lookup ids we know about + size := len(c.sharing) + len(c.isolating) + ids := make(map[identity]nothing, size) + for id := range c.sharing { + ids[id] = null + } + for id := range c.isolating { + ids[id] = null + } + + if err := filepath.WalkDir(c.parentAbs, func(path string, entry os.DirEntry, err error) error { + // a cgroup is a directory + if !entry.IsDir() { + return nil + } + + dir := filepath.Dir(path) + base := filepath.Base(path) + + // only manage scopes directly under nomad.slice + if dir != c.parentAbs || !strings.HasSuffix(base, ".scope") { + return nil + } + + // only remove the scope if we do not track it + id := identity(strings.TrimSuffix(base, ".scope")) + _, exists := ids[id] + if !exists { + c.remove(path) + } + + return nil + }); err != nil { + c.logger.Error("failed to cleanup cgroup", "err", err) + } +} + +//pathOf returns the absolute path to a task with identity id. +func (c *cpusetManagerV2) pathOf(id identity) string { + return filepath.Join(c.parentAbs, makeScope(id)) +} + +// remove does the actual fs delete of the cgroup +// +// We avoid removing a cgroup if it still contains a PID, as the cpuset manager +// may be initially empty on a Nomad client restart. 
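Before the removal and write helpers below, a short standalone illustration of the pool arithmetic that recalculate and reconcile implement. It is a minimal sketch, assuming only the lib/cpuset helpers this patch already depends on (New, Copy, Difference, Union, String); the core numbers are illustrative and not taken from any real node.

package main

import (
	"fmt"

	"github.com/hashicorp/nomad/lib/cpuset"
)

func main() {
	// cores handed to Init by the client (illustrative 4-core node)
	initial := cpuset.New(0, 1, 2, 3)

	// two isolating tasks, each with resources.cpu.reserved_cores set
	taskA := cpuset.New(0)
	taskB := cpuset.New(1)

	// recalculate: the shared pool is everything not reserved, here {2,3}
	pool := initial.Copy()
	pool = pool.Difference(taskA)
	pool = pool.Difference(taskB)

	// reconcile: sharing tasks are written the pool; isolating tasks are
	// written the pool plus their own reservation
	fmt.Println("sharing tasks:", pool.String())
	fmt.Println("task A:", pool.Union(taskA).String())
	fmt.Println("task B:", pool.Union(taskB).String())
}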
+func (c *cpusetManagerV2) remove(path string) { + mgr, err := fs2.NewManager(nil, path, rootless) + if err != nil { + c.logger.Warn("failed to create manager", "path", path, "err", err) + return + } + + // get the list of pids managed by this scope (should be 0 or 1) + pids, _ := mgr.GetPids() + + // do not destroy the scope if a PID is still present + // this is a normal condition when an agent restarts with running tasks + // and the v2 manager is still rebuilding its tracked tasks + if len(pids) > 0 { + return + } + + // remove the cgroup + if err3 := mgr.Destroy(); err3 != nil { + c.logger.Warn("failed to cleanup cgroup", "path", path, "err", err3) + return + } +} + +// write does the actual write of cpuset set for cgroup id +func (c *cpusetManagerV2) write(id identity, set cpuset.CPUSet) { + path := c.pathOf(id) + + // make a manager for the cgroup + m, err := fs2.NewManager(nil, path, rootless) + if err != nil { + c.logger.Error("failed to manage cgroup", "path", path, "err", err) + } + + // create the cgroup + if err = m.Apply(CreationPID); err != nil { + c.logger.Error("failed to apply cgroup", "path", path, "err", err) + } + + // set the cpuset value for the cgroup + if err = m.Set(&configs.Resources{ + CpusetCpus: set.String(), + }); err != nil { + c.logger.Error("failed to set cgroup", "path", path, "err", err) + } +} + +// ensureParent will create the parent cgroup for the manager if it does not +// exist yet. No PIDs are added to any cgroup yet. +func (c *cpusetManagerV2) ensureParent() error { + mgr, err := fs2.NewManager(nil, c.parentAbs, rootless) + if err != nil { + return err + } + + if err = mgr.Apply(CreationPID); err != nil { + return err + } + + c.logger.Trace("establish cgroup hierarchy", "parent", c.parent) + return nil +} + +// fromRoot returns the joined filepath of group on the CgroupRoot +func fromRoot(group string) string { + return filepath.Join(CgroupRoot, group) +} + +// getCPUsFromCgroupV2 retrieves the effective cpuset for the group, which must +// be directly under the cgroup root (i.e. the parent, like nomad.slice). +func getCPUsFromCgroupV2(group string) ([]uint16, error) { + path := fromRoot(group) + effective, err := cgroups.ReadFile(path, "cpuset.cpus.effective") + if err != nil { + return nil, err + } + set, err := cpuset.Parse(effective) + if err != nil { + return nil, err + } + return set.ToSlice(), nil +} + +// getParentV2 returns parent if set, otherwise the default name of Nomad's +// parent cgroup (i.e. nomad.slice). +func getParentV2(parent string) string { + if parent == "" { + return DefaultCgroupParentV2 + } + return parent +} diff --git a/client/lib/cgutil/cpuset_manager_v2_test.go b/client/lib/cgutil/cpuset_manager_v2_test.go new file mode 100644 index 000000000000..e1d658c82589 --- /dev/null +++ b/client/lib/cgutil/cpuset_manager_v2_test.go @@ -0,0 +1,90 @@ +//go:build linux + +package cgutil + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/lib/cpuset" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/stretchr/testify/require" +) + +// Note: these tests need to run on GitHub Actions runners with only 2 cores. +// It is not possible to write more cores to a cpuset than are actually available, +// so make sure tests account for that by setting systemCores as the full set of +// usable cores.
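Before the tests below, a sketch of the on-disk layout the v2 manager assumes: usable cores are read from the parent slice's cpuset.cpus.effective file, and each task owns a scope named <allocID>.<task>.scope under that parent whose cpuset.cpus the manager rewrites. The parent slice, alloc ID, and task name here are placeholders; the cgroup root matches the /sys/fs/cgroup paths used elsewhere in this patch.

package main

import (
	"fmt"

	"github.com/hashicorp/nomad/lib/cpuset"
	"github.com/opencontainers/runc/libcontainer/cgroups"
)

func main() {
	// parent slice managed by Nomad (client configurable, nomad.slice by default)
	parent := "/sys/fs/cgroup/nomad.slice"

	// usable cores come from the parent's effective cpuset, e.g. "0-3"
	effective, err := cgroups.ReadFile(parent, "cpuset.cpus.effective")
	if err != nil {
		panic(err)
	}
	set, err := cpuset.Parse(effective)
	if err != nil {
		panic(err)
	}
	fmt.Println("usable cores:", set.ToSlice())

	// each task then owns a scope named <allocID>.<task>.scope under the
	// parent, and the manager rewrites that scope's cpuset.cpus file
	fmt.Println(parent + "/f00dfeed-0000-0000-0000-000000000000.web.scope/cpuset.cpus")
}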
+var systemCores = []uint16{0, 1} + +func TestCpusetManager_V2_AddAlloc(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + + logger := testlog.HCLogger(t) + parent := uuid.Short() + ".scope" + create(t, parent) + cleanup(t, parent) + + // setup the cpuset manager + manager := NewCpusetManagerV2(parent, logger) + require.NoError(t, manager.Init(systemCores)) + + // add our first alloc, isolating 1 core + t.Run("first", func(t *testing.T) { + alloc := mock.Alloc() + alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(0).ToSlice() + manager.AddAlloc(alloc) + cpusetIs(t, "0-1", parent, alloc.ID, "web") + }) + + // add second alloc, isolating 1 core + t.Run("second", func(t *testing.T) { + alloc := mock.Alloc() + alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(1).ToSlice() + manager.AddAlloc(alloc) + cpusetIs(t, "1", parent, alloc.ID, "web") + }) + + // note that the scheduler, not the cpuset manager, is what prevents over-subscription + // and as such no logic exists here to prevent that +} + +func cpusetIs(t *testing.T, exp, parent, allocID, task string) { + scope := makeScope(makeID(allocID, task)) + value, err := cgroups.ReadFile(filepath.Join(CgroupRoot, parent, scope), "cpuset.cpus") + require.NoError(t, err) + require.Equal(t, exp, strings.TrimSpace(value)) +} + +func TestCpusetManager_V2_RemoveAlloc(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + + logger := testlog.HCLogger(t) + parent := uuid.Short() + ".scope" + create(t, parent) + cleanup(t, parent) + + // setup the cpuset manager + manager := NewCpusetManagerV2(parent, logger) + require.NoError(t, manager.Init(systemCores)) + + // alloc1 gets core 0 + alloc1 := mock.Alloc() + alloc1.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(0).ToSlice() + manager.AddAlloc(alloc1) + + // alloc2 gets core 1 + alloc2 := mock.Alloc() + alloc2.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(1).ToSlice() + manager.AddAlloc(alloc2) + cpusetIs(t, "1", parent, alloc2.ID, "web") + + // with alloc1 gone, alloc2 gets the now shared core + manager.RemoveAlloc(alloc1.ID) + cpusetIs(t, "0-1", parent, alloc2.ID, "web") +} diff --git a/client/testutil/driver_compatible.go b/client/testutil/driver_compatible.go index ff0ecc747f34..2459f8aa6c86 100644 --- a/client/testutil/driver_compatible.go +++ b/client/testutil/driver_compatible.go @@ -5,18 +5,18 @@ import ( "runtime" "syscall" "testing" - - "github.com/hashicorp/nomad/client/lib/cgutil" ) -// RequireRoot skips tests unless running on a Unix as root. +// RequireRoot skips tests unless: +// - running as root func RequireRoot(t *testing.T) { if syscall.Geteuid() != 0 { - t.Skip("Must run as root on Unix") + t.Skip("Test requires root") } } -// RequireConsul skips tests unless a Consul binary is available on $PATH. +// RequireConsul skips tests unless: +// - "consul" executable is detected on $PATH func RequireConsul(t *testing.T) { _, err := exec.Command("consul", "version").CombinedOutput() if err != nil { @@ -24,7 +24,8 @@ func RequireConsul(t *testing.T) { } } -// RequireVault skips tests unless a Vault binary is available on $PATH. 
+// RequireVault skips tests unless: +// - "vault" executable is detected on $PATH func RequireVault(t *testing.T) { _, err := exec.Command("vault", "version").CombinedOutput() if err != nil { @@ -32,19 +33,41 @@ } +// RequireLinux skips tests unless: +// - running on Linux +func RequireLinux(t *testing.T) { + if runtime.GOOS != "linux" { + t.Skip("Test requires Linux") + } +} + +// ExecCompatible skips tests unless: +// - running as root +// - running on Linux func ExecCompatible(t *testing.T) { if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { - t.Skip("Test only available running as root on linux") + t.Skip("Test requires root on Linux") } - CgroupCompatible(t) } +// JavaCompatible skips tests unless: +// - "java" executable is detected on $PATH +// - running as root +// - running on Linux func JavaCompatible(t *testing.T) { - if runtime.GOOS == "linux" && syscall.Geteuid() != 0 { - t.Skip("Test only available when running as root on linux") + _, err := exec.Command("java", "-version").CombinedOutput() + if err != nil { + t.Skipf("Test requires Java: %v", err) + } + + if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { + t.Skip("Test requires root on Linux") + } } +// QemuCompatible skips tests unless: +// - "qemu-system-x86_64" executable is detected on $PATH (!windows) +// - "qemu-img" executable is detected on $PATH (windows) func QemuCompatible(t *testing.T) { // Check if qemu exists bin := "qemu-system-x86_64" @@ -53,23 +76,19 @@ } _, err := exec.Command(bin, "--version").CombinedOutput() if err != nil { - t.Skip("Must have Qemu installed for Qemu specific tests to run") - } -} - -func CgroupCompatible(t *testing.T) { - mount, err := cgutil.FindCgroupMountpointDir() - if err != nil || mount == "" { - t.Skipf("Failed to find cgroup mount: %v %v", mount, err) + t.Skipf("Test requires QEMU (%s)", bin) } } +// MountCompatible skips tests unless: +// - not running on Windows +// - running as root func MountCompatible(t *testing.T) { if runtime.GOOS == "windows" { - t.Skip("Windows does not support mount") + t.Skip("Test requires not using Windows") } if syscall.Geteuid() != 0 { - t.Skip("Must be root to run test") + t.Skip("Test requires root") } } diff --git a/client/testutil/driver_compatible_default.go b/client/testutil/driver_compatible_default.go new file mode 100644 index 000000000000..2f7b3cb8a0ed --- /dev/null +++ b/client/testutil/driver_compatible_default.go @@ -0,0 +1,22 @@ +//go:build !linux + +package testutil + +import ( + "testing" +) + +// CgroupsCompatible skips tests on non-Linux operating systems. +func CgroupsCompatible(t *testing.T) { + t.Skipf("Test requires cgroups support on Linux") +} + +// CgroupsCompatibleV1 skips tests on non-Linux operating systems. +func CgroupsCompatibleV1(t *testing.T) { + t.Skipf("Test requires cgroup.v1 support on Linux") +} + +// CgroupsCompatibleV2 skips tests on non-Linux operating systems. +func CgroupsCompatibleV2(t *testing.T) { + t.Skipf("Test requires cgroup.v2 support on Linux") +} diff --git a/client/testutil/driver_compatible_linux.go b/client/testutil/driver_compatible_linux.go new file mode 100644 index 000000000000..84ace6015f74 --- /dev/null +++ b/client/testutil/driver_compatible_linux.go @@ -0,0 +1,56 @@ +//go:build linux + +package testutil + +import ( + "runtime" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +// CgroupsCompatible returns true if either cgroups.v1 or cgroups.v2 is supported.
+func CgroupsCompatible(t *testing.T) bool { + return cgroupsCompatibleV1(t) || cgroupsCompatibleV2(t) +} + +// CgroupsCompatibleV1 skips tests unless: +// - cgroup.v1 mount point is detected +func CgroupsCompatibleV1(t *testing.T) { + if !cgroupsCompatibleV1(t) { + t.Skipf("Test requires cgroup.v1 support") + } +} + +func cgroupsCompatibleV1(t *testing.T) bool { + if runtime.GOOS != "linux" { + return false + } + + if cgroupsCompatibleV2(t) { + t.Log("No cgroup.v1 mount point: running in cgroup.v2 mode") + return false + } + mount, err := cgroups.GetCgroupMounts(false) + if err != nil { + t.Logf("Unable to detect cgroup.v1 mount point: %v", err) + return false + } + if len(mount) == 0 { + t.Logf("No cgroup.v1 mount point: empty path") + return false + } + return true +} + +// CgroupsCompatibleV2 skips tests unless: +// - cgroup.v2 unified mode is detected +func CgroupsCompatibleV2(t *testing.T) { + if !cgroupsCompatibleV2(t) { + t.Skip("Test requires cgroup.v2 support") + } +} + +func cgroupsCompatibleV2(t *testing.T) bool { + return cgroups.IsCgroup2UnifiedMode() +} diff --git a/command/agent/config.go b/command/agent/config.go index 7ed47aa73d2e..9c062b1a9d53 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -1763,6 +1763,11 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig { if b.BindWildcardDefaultHostNetwork { result.BindWildcardDefaultHostNetwork = true } + + if b.CgroupParent != "" { + result.CgroupParent = b.CgroupParent + } + return &result } diff --git a/drivers/docker/config.go b/drivers/docker/config.go index d45d6f24cab5..6194d60f1e46 100644 --- a/drivers/docker/config.go +++ b/drivers/docker/config.go @@ -754,7 +754,9 @@ func (d *Driver) SetConfig(c *base.Config) error { d.coordinator = newDockerCoordinator(coordinatorConfig) - d.reconciler = newReconciler(d) + d.danglingReconciler = newReconciler(d) + + d.cpusetFixer = newCpusetFixer(d) return nil } diff --git a/drivers/docker/config_test.go b/drivers/docker/config_test.go index df237a440202..d5f1acafc959 100644 --- a/drivers/docker/config_test.go +++ b/drivers/docker/config_test.go @@ -704,7 +704,7 @@ func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) { func TestConfig_DriverConfig_AllowRuntimes(t *testing.T) { ci.Parallel(t) - + cases := []struct { name string config string diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index f86879ebc475..d6e904204aef 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -20,6 +20,7 @@ import ( hclog "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" plugin "github.com/hashicorp/go-plugin" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/drivers/docker/docklog" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -122,7 +123,8 @@ type Driver struct { detected bool detectedLock sync.RWMutex - reconciler *containerReconciler + danglingReconciler *containerReconciler + cpusetFixer CpusetFixer } // NewDockerDriver returns a docker implementation of a driver plugin @@ -350,9 +352,14 @@ CREATE: container.ID, "container_state", container.State.String()) } - if containerCfg.HostConfig.CPUSet == "" && cfg.Resources.LinuxResources.CpusetCgroupPath != "" { - if err := setCPUSetCgroup(cfg.Resources.LinuxResources.CpusetCgroupPath, container.State.Pid); err != nil { - return nil, nil, fmt.Errorf("failed to set the cpuset cgroup for container: %v", err) + if !cgutil.UseV2 { + // This does not apply to cgroups.v2, 
which only allows setting the PID + // into exactly 1 group. For cgroups.v2, we use the cpuset fixer to reconcile + // the cpuset value into the cgroups created by docker in the background. + if containerCfg.HostConfig.CPUSet == "" && cfg.Resources.LinuxResources.CpusetCgroupPath != "" { + if err := setCPUSetCgroup(cfg.Resources.LinuxResources.CpusetCgroupPath, container.State.Pid); err != nil { + return nil, nil, fmt.Errorf("failed to set the cpuset cgroup for container: %v", err) + } } } @@ -841,7 +848,12 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T pidsLimit = driverConfig.PidsLimit } + // Extract the cgroup parent from the nomad cgroup (bypass the need for plugin config) + parent, _ := cgutil.SplitPath(task.Resources.LinuxResources.CpusetCgroupPath) + hostConfig := &docker.HostConfig{ + CgroupParent: parent, + Memory: memory, // hard limit MemoryReservation: memoryReservation, // soft limit diff --git a/drivers/docker/driver_darwin.go b/drivers/docker/driver_darwin.go index b0b1c81dfffc..2db02e4bf6d7 100644 --- a/drivers/docker/driver_darwin.go +++ b/drivers/docker/driver_darwin.go @@ -1,3 +1,5 @@ +//go:build darwin + package docker func setCPUSetCgroup(path string, pid int) error { diff --git a/drivers/docker/driver_darwin_test.go b/drivers/docker/driver_darwin_test.go index 18bfddd6828c..628e89a3e93e 100644 --- a/drivers/docker/driver_darwin_test.go +++ b/drivers/docker/driver_darwin_test.go @@ -1,3 +1,5 @@ +//go:build darwin + package docker import ( diff --git a/drivers/docker/driver_default.go b/drivers/docker/driver_default.go index 8607b899b199..b7bef7173408 100644 --- a/drivers/docker/driver_default.go +++ b/drivers/docker/driver_default.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package docker diff --git a/drivers/docker/driver_linux.go b/drivers/docker/driver_linux.go index b10d3241a670..97cf680b840d 100644 --- a/drivers/docker/driver_linux.go +++ b/drivers/docker/driver_linux.go @@ -1,3 +1,5 @@ +//go:build linux + package docker import ( @@ -7,7 +9,7 @@ import ( ) func setCPUSetCgroup(path string, pid int) error { - // Sometimes the container exists before we can write the + // Sometimes the container exits before we can write the // cgroup resulting in an error which can be ignored. 
err := cgroups.WriteCgroupProc(path, pid) if err != nil && strings.Contains(err.Error(), "no such process") { diff --git a/drivers/docker/driver_linux_test.go b/drivers/docker/driver_linux_test.go index 006517b28fef..88c83fbba5cb 100644 --- a/drivers/docker/driver_linux_test.go +++ b/drivers/docker/driver_linux_test.go @@ -1,3 +1,5 @@ +//go:build linux + package docker import ( diff --git a/drivers/docker/driver_test.go b/drivers/docker/driver_test.go index 26cbf99161d2..cdffca518ae1 100644 --- a/drivers/docker/driver_test.go +++ b/drivers/docker/driver_test.go @@ -1507,9 +1507,7 @@ func TestDockerDriver_Init(t *testing.T) { func TestDockerDriver_CPUSetCPUs(t *testing.T) { ci.Parallel(t) testutil.DockerCompatible(t) - if runtime.GOOS == "windows" { - t.Skip("Windows does not support CPUSetCPUs.") - } + testutil.CgroupsCompatible(t) testCases := []struct { Name string diff --git a/drivers/docker/driver_unix_test.go b/drivers/docker/driver_unix_test.go index fb4fbba254e6..7f019af51614 100644 --- a/drivers/docker/driver_unix_test.go +++ b/drivers/docker/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package docker diff --git a/drivers/docker/driver_windows.go b/drivers/docker/driver_windows.go index 7eda62386f08..f3a4e22ec162 100644 --- a/drivers/docker/driver_windows.go +++ b/drivers/docker/driver_windows.go @@ -1,3 +1,5 @@ +//go:build windows + package docker import docker "github.com/fsouza/go-dockerclient" diff --git a/drivers/docker/driver_windows_test.go b/drivers/docker/driver_windows_test.go index f54b83f20956..765917231862 100644 --- a/drivers/docker/driver_windows_test.go +++ b/drivers/docker/driver_windows_test.go @@ -1,5 +1,4 @@ //go:build windows -// +build windows package docker diff --git a/drivers/docker/fingerprint.go b/drivers/docker/fingerprint.go index be5fd19c0594..f37a169788e1 100644 --- a/drivers/docker/fingerprint.go +++ b/drivers/docker/fingerprint.go @@ -13,9 +13,10 @@ import ( ) func (d *Driver) Fingerprint(ctx context.Context) (<-chan *drivers.Fingerprint, error) { - // start reconciler when we start fingerprinting - // this is the only method called when driver is launched properly - d.reconciler.Start() + // Start docker reconcilers when we start fingerprinting, a workaround for + // task drivers not having a kind of post-setup hook. + d.danglingReconciler.Start() + d.cpusetFixer.Start() ch := make(chan *drivers.Fingerprint) go d.handleFingerprint(ctx, ch) diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go index 7913796324a7..d4fc19601bd8 100644 --- a/drivers/docker/handle.go +++ b/drivers/docker/handle.go @@ -244,6 +244,12 @@ func (h *taskHandle) run() { } else if container.State.OOMKilled { oom = true werr = fmt.Errorf("OOM Killed") + } else if container.State.ExitCode == 137 { + // With cgroups.v2 it seems the cgroup OOM killer is not observed by docker + // container status. So just fudge the connection for now. + // [Mon Mar 21 19:48:21 2022] Memory cgroup out of memory: Killed process 92768 (sh) [...] 
+ oom = true + werr = fmt.Errorf("OOM Killed (137)") } // Shutdown stats collection diff --git a/drivers/docker/reconcile_cpuset.go b/drivers/docker/reconcile_cpuset.go new file mode 100644 index 000000000000..4eab59ba00d9 --- /dev/null +++ b/drivers/docker/reconcile_cpuset.go @@ -0,0 +1,125 @@ +//go:build linux + +package docker + +import ( + "context" + "fmt" + "path/filepath" + "sync" + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/lib/cgutil" + "github.com/hashicorp/nomad/helper" +) + +const ( + cpusetReconcileInterval = 1 * time.Second +) + +type CpusetFixer interface { + Start() +} + +// cpusetFixer adjusts the cpuset.cpus cgroup value to the assigned value by Nomad. +// +// Due to Docker not allowing the configuration of the full cgroup path, we must +// manually fix the cpuset values for all docker containers continuously, as the +// values will change as tasks of any driver using reserved cores are started and +// stopped, changing the size of the remaining shared cpu pool. +// +// The exec/java, podman, and containerd runtimes let you specify the cgroup path, +// making use of the cgroup Nomad creates and manages on behalf of the task. +type cpusetFixer struct { + ctx context.Context + logger hclog.Logger + interval time.Duration + once sync.Once + tasks func() map[coordinate]struct{} +} + +func newCpusetFixer(d *Driver) CpusetFixer { + return &cpusetFixer{ + interval: cpusetReconcileInterval, + ctx: d.ctx, + logger: d.logger, + tasks: d.trackedTasks, + } +} + +// Start will start the background cpuset reconciliation until the cf context is +// cancelled for shutdown. +// +// Only runs if cgroups.v2 is in use. +func (cf *cpusetFixer) Start() { + cf.once.Do(func() { + if cgutil.UseV2 { + go cf.loop() + } + }) +} + +func (cf *cpusetFixer) loop() { + timer, cancel := helper.NewSafeTimer(0) + defer cancel() + + for { + select { + case <-cf.ctx.Done(): + return + case <-timer.C: + timer.Stop() + cf.apply() + timer.Reset(cf.interval) + } + } +} + +func (cf *cpusetFixer) apply() { + coordinates := cf.tasks() + for c := range coordinates { + cf.fix(c) + } +} + +func (cf *cpusetFixer) fix(c coordinate) { + source := c.NomadCgroup() + destination := c.DockerCgroup() + if err := cgutil.CopyCpuset(source, destination); err != nil { + cf.logger.Debug("failed to copy cpuset", "err", err) + } +} + +type coordinate struct { + containerID string + allocID string + task string + path string +} + +func (c coordinate) NomadCgroup() string { + parent, _ := cgutil.SplitPath(c.path) + return filepath.Join(cgutil.CgroupRoot, parent, cgutil.CgroupScope(c.allocID, c.task)) +} + +func (c coordinate) DockerCgroup() string { + parent, _ := cgutil.SplitPath(c.path) + return filepath.Join(cgutil.CgroupRoot, parent, fmt.Sprintf("docker-%s.scope", c.containerID)) +} + +func (d *Driver) trackedTasks() map[coordinate]struct{} { + d.tasks.lock.RLock() + defer d.tasks.lock.RUnlock() + + m := make(map[coordinate]struct{}, len(d.tasks.store)) + for _, h := range d.tasks.store { + m[coordinate{ + containerID: h.containerID, + allocID: h.task.AllocID, + task: h.task.Name, + path: h.task.Resources.LinuxResources.CpusetCgroupPath, + }] = struct{}{} + } + return m +} diff --git a/drivers/docker/reconcile_cpuset_noop.go b/drivers/docker/reconcile_cpuset_noop.go new file mode 100644 index 000000000000..9613a5ad2ee0 --- /dev/null +++ b/drivers/docker/reconcile_cpuset_noop.go @@ -0,0 +1,19 @@ +//go:build !linux + +package docker + +type CpusetFixer interface { + Start() +} + +func 
newCpusetFixer(*Driver) CpusetFixer { + return new(noop) +} + +type noop struct { + // empty +} + +func (*noop) Start() { + // empty +} diff --git a/drivers/docker/reconcile_cpuset_test.go b/drivers/docker/reconcile_cpuset_test.go new file mode 100644 index 000000000000..07ed71294fa7 --- /dev/null +++ b/drivers/docker/reconcile_cpuset_test.go @@ -0,0 +1,36 @@ +//go:build linux + +package docker + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/stretchr/testify/require" +) + +func TestCoordinate_NomadCgroup(t *testing.T) { + ci.Parallel(t) + + result := (coordinate{ + containerID: "c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1", + allocID: "27ee5321-28d6-22d7-9426-4e1888da8e7d", + task: "redis", + path: "/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope", + }).NomadCgroup() + exp := "/sys/fs/cgroup/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope" + require.Equal(t, exp, result) +} + +func TestCoordinate_DockerCgroup(t *testing.T) { + ci.Parallel(t) + + result := (coordinate{ + containerID: "c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1", + allocID: "27ee5321-28d6-22d7-9426-4e1888da8e7d", + task: "redis", + path: "/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope", + }).DockerCgroup() + exp := "/sys/fs/cgroup/nomad.scope/docker-c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1.scope" + require.Equal(t, exp, result) +} diff --git a/drivers/docker/reconciler.go b/drivers/docker/reconcile_dangling.go similarity index 100% rename from drivers/docker/reconciler.go rename to drivers/docker/reconcile_dangling.go diff --git a/drivers/docker/reconciler_test.go b/drivers/docker/reconcile_dangling_test.go similarity index 100% rename from drivers/docker/reconciler_test.go rename to drivers/docker/reconcile_dangling_test.go diff --git a/drivers/docker/util/stats_posix.go b/drivers/docker/util/stats_posix.go index ba1167f344b2..3a9f03f8f539 100644 --- a/drivers/docker/util/stats_posix.go +++ b/drivers/docker/util/stats_posix.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package util diff --git a/drivers/docker/utils_unix_test.go b/drivers/docker/utils_unix_test.go index e53c72bec8ae..0a61bd545395 100644 --- a/drivers/docker/utils_unix_test.go +++ b/drivers/docker/utils_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package docker diff --git a/drivers/docker/utils_windows_test.go b/drivers/docker/utils_windows_test.go index d433055e9615..72d7e0bbfc85 100644 --- a/drivers/docker/utils_windows_test.go +++ b/drivers/docker/utils_windows_test.go @@ -1,5 +1,4 @@ //go:build windows -// +build windows package docker diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go index 422b8b42aab9..882e3ae62f12 100644 --- a/drivers/exec/driver.go +++ b/drivers/exec/driver.go @@ -9,11 +9,10 @@ import ( "sync" "time" - "github.com/hashicorp/nomad/client/lib/cgutil" - "github.com/hashicorp/nomad/drivers/shared/capabilities" - "github.com/hashicorp/consul-template/signals" hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/lib/cgutil" + "github.com/hashicorp/nomad/drivers/shared/capabilities" "github.com/hashicorp/nomad/drivers/shared/eventer" "github.com/hashicorp/nomad/drivers/shared/executor" "github.com/hashicorp/nomad/drivers/shared/resolvconf" diff --git a/drivers/exec/driver_test.go b/drivers/exec/driver_test.go index 
f5c8d9a2b57f..fed60bea9623 100644 --- a/drivers/exec/driver_test.go +++ b/drivers/exec/driver_test.go @@ -17,6 +17,7 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" ctestutils "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/executor" "github.com/hashicorp/nomad/helper/pluginutils/hclutils" @@ -31,25 +32,41 @@ import ( "github.com/stretchr/testify/require" ) +const ( + cgroupParent = "testing.slice" +) + func TestMain(m *testing.M) { if !testtask.Run() { os.Exit(m.Run()) } } -var testResources = &drivers.Resources{ - NomadResources: &structs.AllocatedTaskResources{ - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 128, +func testResources(allocID, task string) *drivers.Resources { + if allocID == "" || task == "" { + panic("must be set") + } + + r := &drivers.Resources{ + NomadResources: &structs.AllocatedTaskResources{ + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 128, + }, + Cpu: structs.AllocatedCpuResources{ + CpuShares: 100, + }, }, - Cpu: structs.AllocatedCpuResources{ - CpuShares: 100, + LinuxResources: &drivers.LinuxResources{ + MemoryLimitBytes: 134217728, + CPUShares: 100, }, - }, - LinuxResources: &drivers.LinuxResources{ - MemoryLimitBytes: 134217728, - CPUShares: 100, - }, + } + + if cgutil.UseV2 { + r.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, cgroupParent, cgutil.CgroupScope(allocID, task)) + } + + return r } func TestExecDriver_Fingerprint_NonLinux(t *testing.T) { @@ -100,7 +117,6 @@ func TestExecDriver_Fingerprint(t *testing.T) { func TestExecDriver_StartWait(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -108,34 +124,35 @@ func TestExecDriver_StartWait(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "cat", Args: []string{"/proc/self/cgroup"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Zero(result.ExitCode) - require.NoError(harness.DestroyTask(task.ID, true)) + require.Zero(t, result.ExitCode) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_StartWaitStopKill(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -143,29 +160,31 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "/bin/bash", Args: []string{"-c", "echo hi; sleep 600"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) 
defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { harness.StopTask(task.ID, 2*time.Second, "SIGINT") @@ -173,9 +192,9 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { select { case result := <-ch: - require.False(result.Successful()) + require.False(t, result.Successful()) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -191,54 +210,55 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_StartWaitRecover(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) - dctx, dcancel := context.WithCancel(context.Background()) - defer dcancel() + dCtx, dCancel := context.WithCancel(context.Background()) + defer dCancel() - d := NewExecDriver(dctx, testlog.HCLogger(t)) + d := NewExecDriver(dCtx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "/bin/sleep", Args: []string{"5"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ctx, cancel := context.WithCancel(context.Background()) ch, err := harness.WaitTask(ctx, handle.Config.ID) - require.NoError(err) + require.NoError(t, err) var wg sync.WaitGroup wg.Add(1) go func() { defer wg.Done() result := <-ch - require.Error(result.Err) + require.Error(t, result.Err) }() - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) cancel() waitCh := make(chan struct{}) @@ -250,31 +270,30 @@ func TestExecDriver_StartWaitRecover(t *testing.T) { select { case <-waitCh: status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateRunning, status.State) + require.NoError(t, err) + require.Equal(t, drivers.TaskStateRunning, status.State) case <-time.After(1 * time.Second): - require.Fail("timeout waiting for task wait to cancel") + require.Fail(t, "timeout waiting for task wait to cancel") } // Loose task d.(*Driver).tasks.Delete(task.ID) _, err = harness.InspectTask(task.ID) - require.Error(err) + require.Error(t, err) - require.NoError(harness.RecoverTask(handle)) + require.NoError(t, harness.RecoverTask(handle)) status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateRunning, status.State) + require.NoError(t, err) + require.Equal(t, drivers.TaskStateRunning, status.State) - require.NoError(harness.StopTask(task.ID, 0, "")) - require.NoError(harness.DestroyTask(task.ID, true)) + 
require.NoError(t, harness.StopTask(task.ID, 0, "")) + require.NoError(t, harness.DestroyTask(task.ID, true)) } // TestExecDriver_NoOrphans asserts that when the main // task dies, the orphans in the PID namespaces are killed by the kernel func TestExecDriver_NoOrphans(t *testing.T) { ci.Parallel(t) - r := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -291,13 +310,19 @@ func TestExecDriver_NoOrphans(t *testing.T) { } var data []byte - r.NoError(basePlug.MsgPackEncode(&data, config)) + require.NoError(t, basePlug.MsgPackEncode(&data, config)) baseConfig := &basePlug.Config{PluginConfig: data} - r.NoError(harness.SetConfig(baseConfig)) + require.NoError(t, harness.SetConfig(baseConfig)) + allocID := uuid.Generate() task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "test", + AllocID: allocID, + ID: uuid.Generate(), + Name: "test", + } + + if cgutil.UseV2 { + task.Resources = testResources(allocID, "test") } cleanup := harness.MkAllocDir(task, true) @@ -307,21 +332,21 @@ func TestExecDriver_NoOrphans(t *testing.T) { taskConfig["command"] = "/bin/sh" // print the child PID in the task PID namespace, then sleep for 5 seconds to give us a chance to examine processes taskConfig["args"] = []string{"-c", fmt.Sprintf(`sleep 3600 & sleep 20`)} - r.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) handle, _, err := harness.StartTask(task) - r.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) waitCh, err := harness.WaitTask(context.Background(), handle.Config.ID) - r.NoError(err) + require.NoError(t, err) - r.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) var childPids []int taskState := TaskState{} testutil.WaitForResult(func() (bool, error) { - r.NoError(handle.GetDriverState(&taskState)) + require.NoError(t, handle.GetDriverState(&taskState)) if taskState.Pid == 0 { return false, fmt.Errorf("task PID is zero") } @@ -343,14 +368,14 @@ func TestExecDriver_NoOrphans(t *testing.T) { } return true, nil }, func(err error) { - r.NoError(err) + require.NoError(t, err) }) select { case result := <-waitCh: - r.True(result.Successful(), "command failed: %#v", result) + require.True(t, result.Successful(), "command failed: %#v", result) case <-time.After(30 * time.Second): - r.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // isProcessRunning returns an error if process is not running @@ -369,7 +394,7 @@ func TestExecDriver_NoOrphans(t *testing.T) { } // task should be dead - r.Error(isProcessRunning(taskState.Pid)) + require.Error(t, isProcessRunning(taskState.Pid)) // all children should eventually be killed by OS testutil.WaitForResult(func() (bool, error) { @@ -384,13 +409,12 @@ func TestExecDriver_NoOrphans(t *testing.T) { } return true, nil }, func(err error) { - r.NoError(err) + require.NoError(t, err) }) } func TestExecDriver_Stats(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) dctx, dcancel := context.WithCancel(context.Background()) @@ -398,45 +422,47 @@ func TestExecDriver_Stats(t *testing.T) { d := NewExecDriver(dctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, 
"test"), } tc := &TaskConfig{ Command: "/bin/sleep", Args: []string{"5"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) ctx, cancel := context.WithCancel(context.Background()) defer cancel() statsCh, err := harness.TaskStats(ctx, task.ID, time.Second*10) - require.NoError(err) + require.NoError(t, err) select { case stats := <-statsCh: - require.NotEmpty(stats.ResourceUsage.MemoryStats.Measured) - require.NotZero(stats.Timestamp) - require.WithinDuration(time.Now(), time.Unix(0, stats.Timestamp), time.Second) + require.NotEmpty(t, stats.ResourceUsage.MemoryStats.Measured) + require.NotZero(t, stats.Timestamp) + require.WithinDuration(t, time.Now(), time.Unix(0, stats.Timestamp), time.Second) case <-time.After(time.Second): - require.Fail("timeout receiving from channel") + require.Fail(t, "timeout receiving from channel") } - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -444,10 +470,12 @@ func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "test"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -461,34 +489,33 @@ func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { fmt.Sprintf(`sleep 1; echo -n %s > /alloc/%s`, string(exp), file), }, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) // Task should terminate quickly waitCh, err := harness.WaitTask(context.Background(), task.ID) - require.NoError(err) + require.NoError(t, err) select { case res := <-waitCh: - require.True(res.Successful(), "task should have exited successfully: %v", res) + require.True(t, res.Successful(), "task should have exited successfully: %v", res) case <-time.After(time.Duration(testutil.TestMultiplier()*5) * time.Second): - require.Fail("timeout waiting for task") + require.Fail(t, "timeout waiting for task") } // Check that data was written to the shared alloc directory. 
outputFile := filepath.Join(task.TaskDir().SharedAllocDir, file) act, err := ioutil.ReadFile(outputFile) - require.NoError(err) - require.Exactly(exp, act) + require.NoError(t, err) + require.Exactly(t, exp, act) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_User(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -496,11 +523,13 @@ func TestExecDriver_User(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", User: "alice", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -509,11 +538,11 @@ func TestExecDriver_User(t *testing.T) { Command: "/bin/sleep", Args: []string{"100"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.Error(err) - require.Nil(handle) + require.Error(t, err) + require.Nil(t, handle) msg := "user alice" if !strings.Contains(err.Error(), msg) { @@ -525,18 +554,20 @@ func TestExecDriver_User(t *testing.T) { // executes commands inside the container. func TestExecDriver_HandlerExec(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) + ctestutils.CgroupsCompatibleV1(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -545,11 +576,11 @@ func TestExecDriver_HandlerExec(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) // Exec a command that should work and dump the environment // TODO: enable section when exec env is fully loaded @@ -575,8 +606,8 @@ func TestExecDriver_HandlerExec(t *testing.T) { // Assert cgroup membership res, err := harness.ExecTask(task.ID, []string{"/bin/cat", "/proc/self/cgroup"}, time.Second) - require.NoError(err) - require.True(res.ExitResult.Successful()) + require.NoError(t, err) + require.True(t, res.ExitResult.Successful()) found := false for _, line := range strings.Split(string(res.Stdout), "\n") { // Every cgroup entry should be /nomad/$ALLOC_ID @@ -594,41 +625,41 @@ func TestExecDriver_HandlerExec(t *testing.T) { } found = true } - require.True(found, "exec'd command isn't in the task's cgroup") + require.True(t, found, "exec'd command isn't in the task's cgroup") // Exec a command that should fail res, err = harness.ExecTask(task.ID, []string{"/usr/bin/stat", "lkjhdsaflkjshowaisxmcvnlia"}, time.Second) - require.NoError(err) - require.False(res.ExitResult.Successful()) + require.NoError(t, err) + require.False(t, res.ExitResult.Successful()) if expected := "No such file or directory"; !bytes.Contains(res.Stdout, []byte(expected)) { 
t.Fatalf("expected output to contain %q but found: %q", expected, res.Stdout) } - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_DevicesAndMounts(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) tmpDir, err := ioutil.TempDir("", "exec_binds_mounts") - require.NoError(err) + require.NoError(t, err) defer os.RemoveAll(tmpDir) err = ioutil.WriteFile(filepath.Join(tmpDir, "testfile"), []byte("from-host"), 600) - require.NoError(err) + require.NoError(t, err) ctx, cancel := context.WithCancel(context.Background()) defer cancel() d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ ID: uuid.Generate(), Name: "test", User: "root", // need permission to read mounts paths - Resources: testResources, + Resources: testResources(allocID, "test"), StdoutPath: filepath.Join(tmpDir, "task-stdout"), StderrPath: filepath.Join(tmpDir, "task-stderr"), Devices: []*drivers.DeviceConfig{ @@ -652,8 +683,8 @@ func TestExecDriver_DevicesAndMounts(t *testing.T) { }, } - require.NoError(ioutil.WriteFile(task.StdoutPath, []byte{}, 660)) - require.NoError(ioutil.WriteFile(task.StderrPath, []byte{}, 660)) + require.NoError(t, ioutil.WriteFile(task.StdoutPath, []byte{}, 660)) + require.NoError(t, ioutil.WriteFile(task.StderrPath, []byte{}, 660)) tc := &TaskConfig{ Command: "/bin/bash", @@ -669,38 +700,38 @@ touch /tmp/task-path-ro/testfile-from-ro && echo from-exec > /tmp/task-path-ro/ exit 0 `}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) stdout, err := ioutil.ReadFile(task.StdoutPath) - require.NoError(err) - require.Equal(`mounted device /inserted-random: 1:8 + require.NoError(t, err) + require.Equal(t, `mounted device /inserted-random: 1:8 reading from ro path: from-host reading from rw path: from-host overwriting file in rw succeeded writing new file in rw succeeded`, strings.TrimSpace(string(stdout))) stderr, err := ioutil.ReadFile(task.StderrPath) - require.NoError(err) - require.Equal(`touch: cannot touch '/tmp/task-path-ro/testfile': Read-only file system + require.NoError(t, err) + require.Equal(t, `touch: cannot touch '/tmp/task-path-ro/testfile': Read-only file system touch: cannot touch '/tmp/task-path-ro/testfile-from-ro': Read-only file system`, strings.TrimSpace(string(stderr))) // testing exit code last so we can inspect output first - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) fromRWContent, err := ioutil.ReadFile(filepath.Join(tmpDir, "testfile-from-rw")) - require.NoError(err) - require.Equal("from-exec", strings.TrimSpace(string(fromRWContent))) + require.NoError(t, err) + require.Equal(t, "from-exec", strings.TrimSpace(string(fromRWContent))) } func TestConfig_ParseAllHCL(t *testing.T) { @@ -719,13 +750,11 @@ config { var tc *TaskConfig hclutils.NewConfigParser(taskConfigSpec).ParseHCL(t, cfgStr, &tc) - require.EqualValues(t, expected, tc) } func TestExecDriver_NoPivotRoot(t *testing.T) { ci.Parallel(t) - r 
:= require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -741,14 +770,16 @@ func TestExecDriver_NoPivotRoot(t *testing.T) { } var data []byte - r.NoError(basePlug.MsgPackEncode(&data, config)) + require.NoError(t, basePlug.MsgPackEncode(&data, config)) bconfig := &basePlug.Config{PluginConfig: data} - r.NoError(harness.SetConfig(bconfig)) + require.NoError(t, harness.SetConfig(bconfig)) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -757,11 +788,11 @@ func TestExecDriver_NoPivotRoot(t *testing.T) { Command: "/bin/sleep", Args: []string{"100"}, } - r.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - r.NoError(err) - r.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) } func TestDriver_Config_validate(t *testing.T) { diff --git a/drivers/exec/driver_unix_test.go b/drivers/exec/driver_unix_test.go index 6d62902f8a0c..ce765835dc3c 100644 --- a/drivers/exec/driver_unix_test.go +++ b/drivers/exec/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package exec @@ -11,6 +10,7 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" ctestutils "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/capabilities" "github.com/hashicorp/nomad/drivers/shared/executor" @@ -26,7 +26,6 @@ import ( func TestExecDriver_StartWaitStop(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -34,29 +33,31 @@ func TestExecDriver_StartWaitStop(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } taskConfig := map[string]interface{}{ "command": "/bin/sleep", "args": []string{"600"}, } - require.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) defer harness.DestroyTask(task.ID, true) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { harness.StopTask(task.ID, 2*time.Second, "SIGKILL") @@ -64,9 +65,9 @@ func TestExecDriver_StartWaitStop(t *testing.T) { select { case result := <-ch: - require.Equal(int(unix.SIGKILL), result.Signal) + require.Equal(t, int(unix.SIGKILL), result.Signal) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -82,13 +83,12 @@ func TestExecDriver_StartWaitStop(t *testing.T) { return true, nil }, func(err error) { 
- require.NoError(err) + require.NoError(t, err) }) } func TestExec_ExecTaskStreaming(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -102,6 +102,12 @@ func TestExec_ExecTaskStreaming(t *testing.T) { Name: "sleep", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = testResources(allocID, "sleep") + } + cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -109,14 +115,13 @@ func TestExec_ExecTaskStreaming(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) _, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.ExecTaskStreamingConformanceTests(t, harness, task.ID) - } // Tests that a given DNSConfig properly configures dns @@ -124,7 +129,6 @@ func TestExec_dnsConfig(t *testing.T) { ci.Parallel(t) ctestutils.RequireRoot(t) ctestutils.ExecCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -163,6 +167,11 @@ func TestExec_dnsConfig(t *testing.T) { DNS: c.cfg, } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.Resources = testResources(allocID, "sleep") + } + cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -170,10 +179,10 @@ func TestExec_dnsConfig(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) _, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.TestTaskDNSConfig(t, harness, task.ID, c.cfg) @@ -189,6 +198,12 @@ func TestExecDriver_Capabilities(t *testing.T) { Name: "sleep", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = testResources(allocID, "sleep") + } + for _, tc := range []struct { Name string CapAdd []string diff --git a/drivers/java/driver_test.go b/drivers/java/driver_test.go index 63cb4c12043d..cc7a74378053 100644 --- a/drivers/java/driver_test.go +++ b/drivers/java/driver_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "github.com/hashicorp/nomad/client/lib/cgutil" "io" "io/ioutil" "os" @@ -12,14 +13,13 @@ import ( "time" "github.com/hashicorp/nomad/ci" - dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" - ctestutil "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/pluginutils/hclutils" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" + dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" ) @@ -60,7 +60,6 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -72,6 +71,7 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { Args: []string{"1"}, JvmOpts: []string{"-Xmx64m", "-Xms32m"}, } + task := basicTask(t, "demo-app", tc) cleanup := harness.MkAllocDir(task, true) @@ -80,28 +80,27 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { copyFile("./test-resources/demoapp.jar", filepath.Join(task.TaskDir().Dir, 
"demoapp.jar"), t) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Nil(result.Err) + require.Nil(t, result.Err) - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) // Get the stdout of the process and assert that it's not empty stdout, err := ioutil.ReadFile(filepath.Join(task.TaskDir().LogDir, "demo-app.stdout.0")) - require.NoError(err) - require.Contains(string(stdout), "Hello") + require.NoError(t, err) + require.Contains(t, string(stdout), "Hello") - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -121,12 +120,12 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { copyFile("./test-resources/demoapp.jar", filepath.Join(task.TaskDir().Dir, "demoapp.jar"), t) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { time.Sleep(10 * time.Millisecond) @@ -135,9 +134,9 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { select { case result := <-ch: - require.False(result.Successful()) + require.False(t, result.Successful()) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -153,17 +152,16 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaDriver_Class_Start_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -182,21 +180,21 @@ func TestJavaDriver_Class_Start_Wait(t *testing.T) { copyFile("./test-resources/Hello.class", filepath.Join(task.TaskDir().Dir, "Hello.class"), t) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Nil(result.Err) + require.Nil(t, result.Err) - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) // Get the stdout of the process and assert that it's not empty stdout, err := ioutil.ReadFile(filepath.Join(task.TaskDir().LogDir, "demo-app.stdout.0")) - require.NoError(err) - require.Contains(string(stdout), "Hello") + require.NoError(t, err) + require.Contains(t, string(stdout), "Hello") - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaCmdArgs(t *testing.T) { @@ -254,7 +252,6 @@ func TestJavaDriver_ExecTaskStreaming(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -274,7 +271,7 @@ func 
TestJavaDriver_ExecTaskStreaming(t *testing.T) { copyFile("./test-resources/Hello.class", filepath.Join(task.TaskDir().Dir, "Hello.class"), t) _, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.ExecTaskStreamingConformanceTests(t, harness, task.ID) @@ -283,9 +280,11 @@ func TestJavaDriver_ExecTaskStreaming(t *testing.T) { func basicTask(t *testing.T, name string, taskConfig *TaskConfig) *drivers.TaskConfig { t.Helper() + allocID := uuid.Generate() task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: name, + AllocID: allocID, + ID: uuid.Generate(), + Name: name, Resources: &drivers.Resources{ NomadResources: &structs.AllocatedTaskResources{ Memory: structs.AllocatedMemoryResources{ @@ -302,6 +301,10 @@ func basicTask(t *testing.T, name string, taskConfig *TaskConfig) *drivers.TaskC }, } + if cgutil.UseV2 { + task.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, name)) + } + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) return task } diff --git a/drivers/rawexec/driver_test.go b/drivers/rawexec/driver_test.go index 4b76350980fe..bac01470c708 100644 --- a/drivers/rawexec/driver_test.go +++ b/drivers/rawexec/driver_test.go @@ -323,6 +323,8 @@ func TestRawExecDriver_Start_Wait_AllocDir(t *testing.T) { func TestRawExecDriver_Start_Kill_Wait_Cgroup(t *testing.T) { ci.Parallel(t) ctestutil.ExecCompatible(t) + ctestutil.CgroupsCompatibleV1(t) // todo(shoenig) #12348 + require := require.New(t) pidFile := "pid" @@ -414,6 +416,9 @@ func TestRawExecDriver_Start_Kill_Wait_Cgroup(t *testing.T) { func TestRawExecDriver_Exec(t *testing.T) { ci.Parallel(t) + ctestutil.ExecCompatible(t) + ctestutil.CgroupsCompatibleV1(t) // todo(shoenig) #12348 + require := require.New(t) d := newEnabledRawExecDriver(t) diff --git a/drivers/rawexec/driver_unix_test.go b/drivers/rawexec/driver_unix_test.go index bf7e0c4e2c35..895906bc9131 100644 --- a/drivers/rawexec/driver_unix_test.go +++ b/drivers/rawexec/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package rawexec @@ -18,8 +17,11 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" + clienttestutil "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/testtask" "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" basePlug "github.com/hashicorp/nomad/plugins/base" "github.com/hashicorp/nomad/plugins/drivers" dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" @@ -30,9 +32,7 @@ import ( func TestRawExecDriver_User(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } + clienttestutil.RequireLinux(t) require := require.New(t) d := newEnabledRawExecDriver(t) @@ -205,13 +205,8 @@ func TestRawExecDriver_StartWaitStop(t *testing.T) { // task processes are cleaned up. 
func TestRawExecDriver_DestroyKillsAll(t *testing.T) { ci.Parallel(t) - - // This only works reliably with cgroup PID tracking, happens in linux only - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } - - require := require.New(t) + clienttestutil.RequireLinux(t) + clienttestutil.CgroupsCompatibleV1(t) // todo(shoenig): #12348 d := newEnabledRawExecDriver(t) harness := dtestutil.NewDriverHarness(t, d) @@ -222,6 +217,15 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { Name: "test", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = new(drivers.Resources) + task.Resources.NomadResources = new(structs.AllocatedTaskResources) + task.Resources.LinuxResources = new(drivers.LinuxResources) + task.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, "test")) + } + cleanup := harness.MkAllocDir(task, true) defer cleanup() @@ -229,20 +233,20 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { taskConfig["command"] = "/bin/sh" taskConfig["args"] = []string{"-c", fmt.Sprintf(`sleep 3600 & echo "SLEEP_PID=$!"`)} - require.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) select { case result := <-ch: - require.True(result.Successful(), "command failed: %#v", result) + require.True(t, result.Successful(), "command failed: %#v", result) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } sleepPid := 0 @@ -268,7 +272,7 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { sleepPid = pid return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) // isProcessRunning returns an error if process is not running @@ -286,9 +290,9 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { return nil } - require.NoError(isProcessRunning(sleepPid)) + require.NoError(t, isProcessRunning(sleepPid)) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) testutil.WaitForResult(func() (bool, error) { err := isProcessRunning(sleepPid) @@ -302,7 +306,7 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) } @@ -342,9 +346,7 @@ func TestRawExec_ExecTaskStreaming(t *testing.T) { func TestRawExec_ExecTaskStreaming_User(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("skip, requires running on Linux for testing custom user setting") - } + clienttestutil.RequireLinux(t) d := newEnabledRawExecDriver(t) harness := dtestutil.NewDriverHarness(t, d) @@ -381,9 +383,7 @@ func TestRawExec_ExecTaskStreaming_User(t *testing.T) { func TestRawExecDriver_NoCgroup(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } + clienttestutil.RequireLinux(t) expectedBytes, err := ioutil.ReadFile("/proc/self/cgroup") require.NoError(t, err) diff --git a/drivers/shared/executor/executor.go b/drivers/shared/executor/executor.go index 5026730b6971..99819d85a66d 100644 --- a/drivers/shared/executor/executor.go +++ b/drivers/shared/executor/executor.go @@ -302,7 
+302,8 @@ func (e *UniversalExecutor) Launch(command *ExecCommand) (*ProcessState, error) // Setup cgroups on linux if e.commandCfg.ResourceLimits || e.commandCfg.BasicProcessCgroup { - if err := e.configureResourceContainer(os.Getpid()); err != nil { + pid := os.Getpid() + if err := e.configureResourceContainer(pid); err != nil { return nil, err } } diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 989c4a062c4c..cdb14f64dba1 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -1,10 +1,10 @@ //go:build linux -// +build linux package executor import ( "context" + "errors" "fmt" "io" "os" @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/consul-template/signals" hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/stats" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -37,10 +38,6 @@ import ( "golang.org/x/sys/unix" ) -const ( - defaultCgroupParent = "/nomad" -) - var ( // ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1 ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"} @@ -71,7 +68,6 @@ type LibcontainerExecutor struct { } func NewExecutorWithIsolation(logger hclog.Logger) Executor { - logger = logger.Named("isolated_executor") if err := shelpers.Init(); err != nil { logger.Error("unable to initialize stats", "error", err) @@ -665,14 +661,27 @@ func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error { } func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error { - // If resources are not limited then manually create cgroups needed if !command.ResourceLimits { - return configureBasicCgroups(cfg) + return cgutil.ConfigureBasicCgroups(cfg) } - id := uuid.Generate() - cfg.Cgroups.Path = filepath.Join("/", defaultCgroupParent, id) + // set cgroups path + if cgutil.UseV2 { + // in v2, the cgroup must have been created by the client already, + // which breaks a lot of existing tests that run drivers without a client + if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" { + return errors.New("cgroup path must be set") + } + parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath) + cfg.Cgroups.Path = filepath.Join("/", parent, cgroup) + } else { + // in v1, the cgroup is created using /nomad, which is a bug because it + // does not respect the cgroup_parent client configuration + // (but makes testing easy) + id := uuid.Generate() + cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id) + } if command.Resources == nil || command.Resources.NomadResources == nil { return nil @@ -715,44 +724,6 @@ func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error { return nil } -func configureBasicCgroups(cfg *lconfigs.Config) error { - id := uuid.Generate() - - // Manually create freezer cgroup - - subsystem := "freezer" - - path, err := getCgroupPathHelper(subsystem, filepath.Join(defaultCgroupParent, id)) - if err != nil { - return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err) - } - - if err = os.MkdirAll(path, 0755); err != nil { - return err - } - - cfg.Cgroups.Paths = map[string]string{ - subsystem: path, - 
} - return nil -} - -func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) - if err != nil { - return "", err - } - - // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. - relCgroup, err := filepath.Rel(root, cgroup) - if err != nil { - return "", err - } - - return filepath.Join(mnt, relCgroup), nil -} - func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) { cfg := &lconfigs.Config{ Cgroups: &lconfigs.Cgroup{ diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 67a991111471..d38072ad561c 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -14,6 +14,7 @@ import ( "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -82,6 +83,12 @@ func testExecutorCommandWithChroot(t *testing.T) *testExecCmd { }, } + if cgutil.UseV2 { + cmd.Resources.LinuxResources = &drivers.LinuxResources{ + CpusetCgroupPath: filepath.Join(cgutil.CgroupRoot, "testing.scope", cgutil.CgroupScope(alloc.ID, task.Name)), + } + } + testCmd := &testExecCmd{ command: cmd, allocDir: allocDir, @@ -164,8 +171,10 @@ func TestExecutor_Isolation_PID_and_IPC_hostMode(t *testing.T) { func TestExecutor_IsolationAndConstraints(t *testing.T) { ci.Parallel(t) - r := require.New(t) testutil.ExecCompatible(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig): hard codes cgroups v1 lookup + + r := require.New(t) testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir @@ -256,9 +265,10 @@ passwd` // hierarchy created for this process func TestExecutor_CgroupPaths(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + require := require.New(t) + testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir execCmd.Cmd = "/bin/bash" @@ -280,27 +290,33 @@ func TestExecutor_CgroupPaths(t *testing.T) { tu.WaitForResult(func() (bool, error) { output := strings.TrimSpace(testExecCmd.stdout.String()) - // Verify that we got some cgroups - if !strings.Contains(output, ":devices:") { - return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) - } - lines := strings.Split(output, "\n") - for _, line := range lines { - // Every cgroup entry should be /nomad/$ALLOC_ID - if line == "" { - continue + switch cgutil.UseV2 { + case true: + isScope := strings.HasSuffix(output, ".scope") + require.True(isScope) + case false: + // Verify that we got some cgroups + if !strings.Contains(output, ":devices:") { + return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) } + lines := strings.Split(output, "\n") + for _, line := range lines { + // Every cgroup entry should be /nomad/$ALLOC_ID + if line == "" { + continue + } - // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker - // don't isolate it by default. - // :: filters out odd empty cgroup found in latest Ubuntu lines, e.g. 
0::/user.slice/user-1000.slice/session-17.scope - // that is also not used for isolation - if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { - continue - } + // Skip rdma & misc subsystem; rdma was added in most recent kernels and libcontainer/docker + // don't isolate it by default. + // :: filters out odd empty cgroup found in latest Ubuntu lines, e.g. 0::/user.slice/user-1000.slice/session-17.scope + // that is also not used for isolation + if strings.Contains(line, ":rdma:") || strings.Contains(line, ":misc:") || strings.Contains(line, "::") { + continue + } + if !strings.Contains(line, ":/nomad/") { + return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) + } - if !strings.Contains(line, ":/nomad/") { - return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) } } return true, nil @@ -311,9 +327,10 @@ func TestExecutor_CgroupPaths(t *testing.T) { // are destroyed on shutdown func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + require := require.New(t) + testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir execCmd.Cmd = "/bin/bash" @@ -336,28 +353,34 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { var cgroupsPaths string tu.WaitForResult(func() (bool, error) { output := strings.TrimSpace(testExecCmd.stdout.String()) - // Verify that we got some cgroups - if !strings.Contains(output, ":devices:") { - return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) - } - lines := strings.Split(output, "\n") - for _, line := range lines { - // Every cgroup entry should be /nomad/$ALLOC_ID - if line == "" { - continue - } - // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker - // don't isolate it by default. - if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { - continue + switch cgutil.UseV2 { + case true: + isScope := strings.HasSuffix(output, ".scope") + require.True(isScope) + case false: + // Verify that we got some cgroups + if !strings.Contains(output, ":devices:") { + return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) } + lines := strings.Split(output, "\n") + for _, line := range lines { + // Every cgroup entry should be /nomad/$ALLOC_ID + if line == "" { + continue + } - if !strings.Contains(line, ":/nomad/") { - return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) + // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker + // don't isolate it by default. + if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { + continue + } + + if !strings.Contains(line, ":/nomad/") { + return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... 
-- found=%q", line) + } } } - cgroupsPaths = output return true, nil }, func(err error) { t.Error(err) }) @@ -383,7 +406,7 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { continue } - p, err := getCgroupPathHelper(subsystem, cgroup) + p, err := cgutil.GetCgroupPathHelperV1(subsystem, cgroup) require.NoError(err) require.Falsef(cgroups.PathExists(p), "cgroup for %s %s still exists", subsystem, cgroup) } @@ -433,8 +456,10 @@ func TestUniversalExecutor_LookupTaskBin(t *testing.T) { // Exec Launch looks for the binary only inside the chroot func TestExecutor_EscapeContainer(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig) kills the terminal, probably defaulting to / + + require := require.New(t) testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir diff --git a/drivers/shared/executor/executor_test.go b/drivers/shared/executor/executor_test.go index 4d930843c1bf..57fbbb015d0e 100644 --- a/drivers/shared/executor/executor_test.go +++ b/drivers/shared/executor/executor_test.go @@ -18,7 +18,9 @@ import ( hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" + "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -90,6 +92,10 @@ func testExecutorCommand(t *testing.T) *testExecCmd { }, } + if cgutil.UseV2 { + cmd.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.scope", cgutil.CgroupScope(alloc.ID, task.Name)) + } + testCmd := &testExecCmd{ command: cmd, allocDir: allocDir, @@ -253,6 +259,8 @@ func TestExecutor_Start_Wait_Children(t *testing.T) { func TestExecutor_WaitExitSignal(t *testing.T) { ci.Parallel(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig) #12351 + for name, factory := range executorFactories { t.Run(name, func(t *testing.T) { testExecCmd := testExecutorCommand(t) @@ -519,6 +527,7 @@ func copyFile(t *testing.T, src, dst string) { func TestExecutor_Start_Kill_Immediately_NoGrace(t *testing.T) { ci.Parallel(t) for name, factory := range executorFactories { + t.Run(name, func(t *testing.T) { require := require.New(t) testExecCmd := testExecutorCommand(t) diff --git a/drivers/shared/executor/executor_universal_linux.go b/drivers/shared/executor/executor_universal_linux.go index d5076a8d3d6a..7c97681df55c 100644 --- a/drivers/shared/executor/executor_universal_linux.go +++ b/drivers/shared/executor/executor_universal_linux.go @@ -10,6 +10,7 @@ import ( "github.com/containernetworking/plugins/pkg/ns" multierror "github.com/hashicorp/go-multierror" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/plugins/drivers" "github.com/opencontainers/runc/libcontainer/cgroups" cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" @@ -77,8 +78,7 @@ func (e *UniversalExecutor) configureResourceContainer(pid int) error { cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule) } - err := configureBasicCgroups(cfg) - if err != nil { + if err := cgutil.ConfigureBasicCgroups(cfg); err != nil { // Log this error to help diagnose cases where nomad is run with too few // permissions, but don't return an error. 
There is no separate check for // cgroup creation permissions, so this may be the happy path. diff --git a/drivers/shared/executor/client.go b/drivers/shared/executor/grpc_client.go similarity index 100% rename from drivers/shared/executor/client.go rename to drivers/shared/executor/grpc_client.go diff --git a/drivers/shared/executor/server.go b/drivers/shared/executor/grpc_server.go similarity index 100% rename from drivers/shared/executor/server.go rename to drivers/shared/executor/grpc_server.go diff --git a/drivers/shared/executor/resource_container_linux.go b/drivers/shared/executor/resource_container_linux.go index 50ab7cef9e6e..dbdf83f16282 100644 --- a/drivers/shared/executor/resource_container_linux.go +++ b/drivers/shared/executor/resource_container_linux.go @@ -30,6 +30,7 @@ func (rc *resourceContainerContext) isEmpty() bool { return rc.groups == nil } +// todo(shoenig) cgroups.v2 #12351 func (rc *resourceContainerContext) getAllPidsByCgroup() (map[int]*nomadPid, error) { nPids := map[int]*nomadPid{} diff --git a/go.mod b/go.mod index 0218687a452d..11a3750bdcc3 100644 --- a/go.mod +++ b/go.mod @@ -97,7 +97,7 @@ require ( github.com/mitchellh/mapstructure v1.4.3 github.com/mitchellh/reflectwalk v1.0.2 github.com/moby/sys/mount v0.3.0 - github.com/moby/sys/mountinfo v0.5.0 + github.com/moby/sys/mountinfo v0.6.0 github.com/opencontainers/runc v1.0.3 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 github.com/pkg/errors v0.9.1 @@ -162,9 +162,9 @@ require ( github.com/boltdb/bolt v1.3.1 // indirect github.com/census-instrumentation/opencensus-proto v0.3.0 // indirect github.com/cespare/xxhash/v2 v2.1.2 // indirect - github.com/checkpoint-restore/go-criu/v5 v5.0.0 // indirect + github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect github.com/cheggaaa/pb/v3 v3.0.5 // indirect - github.com/cilium/ebpf v0.6.2 // indirect + github.com/cilium/ebpf v0.8.1 // indirect github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible // indirect github.com/circonus-labs/circonusllhist v0.1.3 // indirect github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4 // indirect @@ -173,7 +173,7 @@ require ( github.com/containerd/console v1.0.3 // indirect github.com/containerd/containerd v1.5.9 // indirect github.com/coreos/go-systemd/v22 v22.3.2 // indirect - github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 // indirect + github.com/cyphar/filepath-securejoin v0.2.3 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba // indirect github.com/digitalocean/godo v1.10.0 // indirect @@ -186,7 +186,7 @@ require ( github.com/envoyproxy/protoc-gen-validate v0.6.2 // indirect github.com/felixge/httpsnoop v1.0.1 // indirect github.com/go-ole/go-ole v1.2.6 // indirect - github.com/godbus/dbus/v5 v5.0.4 // indirect + github.com/godbus/dbus/v5 v5.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/gojuno/minimock/v3 v3.0.6 // indirect github.com/golang-jwt/jwt/v4 v4.0.0 // indirect @@ -232,7 +232,7 @@ require ( github.com/oklog/run v1.0.1-0.20180308005104-6934b124db28 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.0.2 // indirect - github.com/opencontainers/selinux v1.8.2 // indirect + github.com/opencontainers/selinux v1.10.0 // indirect github.com/packethost/packngo v0.1.1-0.20180711074735-b9cb5096f54c // indirect github.com/pierrec/lz4 v2.5.2+incompatible // indirect github.com/pmezard/go-difflib v1.0.0 
// indirect @@ -241,7 +241,7 @@ require ( github.com/prometheus/procfs v0.7.3 // indirect github.com/renier/xmlrpc v0.0.0-20170708154548-ce4a1a486c03 // indirect github.com/rogpeppe/go-internal v1.6.1 // indirect - github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128 // indirect + github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 // indirect github.com/sirupsen/logrus v1.8.1 // indirect github.com/softlayer/softlayer-go v0.0.0-20180806151055-260589d94c7d // indirect github.com/stretchr/objx v0.2.0 // indirect diff --git a/go.sum b/go.sum index 9a472d9f02ae..ff8ba072fda7 100644 --- a/go.sum +++ b/go.sum @@ -209,8 +209,9 @@ github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/checkpoint-restore/go-criu/v4 v4.1.0/go.mod h1:xUQBLp4RLc5zJtWY++yjOoMoB5lihDt7fai+75m+rGw= -github.com/checkpoint-restore/go-criu/v5 v5.0.0 h1:TW8f/UvntYoVDMN1K2HlT82qH1rb0sOjpGw3m6Ym+i4= github.com/checkpoint-restore/go-criu/v5 v5.0.0/go.mod h1:cfwC0EG7HMUenopBsUf9d89JlCLQIfgVcNsNN0t6T2M= +github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8= +github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E= github.com/cheggaaa/pb v1.0.27 h1:wIkZHkNfC7R6GI5w7l/PdAdzXzlrbcI3p8OAlnkTsnc= github.com/cheggaaa/pb v1.0.27/go.mod h1:pQciLPpbU0oxA0h+VJYYLxO+XeDQb5pZijXscXHm81s= github.com/cheggaaa/pb/v3 v3.0.5 h1:lmZOti7CraK9RSjzExsY53+WWfub9Qv13B5m4ptEoPE= @@ -222,8 +223,9 @@ github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3/go.mod h1:MA5e5Lr8slmE github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775/go.mod h1:7cR51M8ViRLIdUjrmSXlK9pkrsDlLHbO8jiB8X8JnOc= github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= -github.com/cilium/ebpf v0.6.2 h1:iHsfF/t4aW4heW2YKfeHrVPGdtYTL4C4KocpM8KTSnI= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= +github.com/cilium/ebpf v0.8.1 h1:bLSSEbBLqGPXxls55pGr5qWZaTqcmfDJHhou7t254ao= +github.com/cilium/ebpf v0.8.1/go.mod h1:f5zLIM0FSNuAkSyLAN7X+Hy6yznlF1mNiWUMfxMtrgk= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible h1:C29Ae4G5GtYyYMm1aztcyj/J5ckgJm2zwdDajFbx1NY= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3 h1:TJH+oke8D16535+jHExHj4nQvzlZrj7ug5D7I/orNUA= @@ -372,8 +374,8 @@ github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ github.com/creack/pty v1.1.17 h1:QeVUsEDNrLBW4tMgZHvxy18sKtr6VI492kBhUfhDJNI= github.com/creack/pty v1.1.17/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= -github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 h1:dCqRswe3ZAwkQWdvFLwRqmJCpGP3DWb7bFogdqY3+QU= -github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= +github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI= +github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/d2g/dhcp4 
v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ= github.com/d2g/dhcp4client v1.0.0/go.mod h1:j0hNfjhrt2SxUOw55nL0ATM/z4Yt3t2Kd1mW34z5W5s= github.com/d2g/dhcp4server v0.0.0-20181031114812-7d4a0a7f59a5/go.mod h1:Eo87+Kg/IX2hfWJfwxMzLyuSZyxSoAug2nGa1G2QAi8= @@ -462,8 +464,9 @@ github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSw github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/frankban/quicktest v1.4.0/go.mod h1:36zfPVQyHxymz4cH7wlDmVwDrJuljRB60qkgn7rorfQ= github.com/frankban/quicktest v1.10.0/go.mod h1:ui7WezCLWMWxVWr1GETZY3smRy0G4KWq9vcPtJmFl7Y= -github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/frankban/quicktest v1.14.0 h1:+cqqvzZV87b4adx/5ayVOaYZ2CrvM4ejQvUdBzPPUss= +github.com/frankban/quicktest v1.14.0/go.mod h1:NeW+ay9A/U67EYXNFA1nPE8e/tnQv/09mUdL/ijj8og= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= @@ -515,8 +518,9 @@ github.com/godbus/dbus v0.0.0-20180201030542-885f9cc04c9c/go.mod h1:/YcGZj5zSblf github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e h1:BWhy2j3IXJhjCbC68FptL43tDKIq8FladmaTs3Xs7Z8= github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= +github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/googleapis v1.2.0/go.mod h1:Njal3psf3qN6dwBtQfUmBZh2ybovJ0tlu3o/AC7HYjU= github.com/gogo/googleapis v1.4.0/go.mod h1:5YRNX2z1oM5gXdAkurHa942MDgEJyk02w4OecKY87+c= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= @@ -985,8 +989,9 @@ github.com/moby/sys/mount v0.3.0 h1:bXZYMmq7DBQPwHRxH/MG+u9+XF90ZOwoXpHTOznMGp0= github.com/moby/sys/mount v0.3.0/go.mod h1:U2Z3ur2rXPFrFmy4q6WMwWrBOAQGYtYTRVM8BIvzbwk= github.com/moby/sys/mountinfo v0.4.0/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= github.com/moby/sys/mountinfo v0.4.1/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= -github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI= github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= +github.com/moby/sys/mountinfo v0.6.0 h1:gUDhXQx58YNrpHlK4nSL+7y2pxFZkUcXqzFDKWdC0Oo= +github.com/moby/sys/mountinfo v0.6.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= github.com/moby/sys/symlink v0.1.0/go.mod h1:GGDODQmbFOjFsXvfLVn3+ZRxkch54RkSiGqsZeMYowQ= github.com/moby/term v0.0.0-20200312100748-672ec06f55cd/go.mod h1:DdlQx2hp0Ss5/fLikoLlEeIYiATotOjgB//nb973jeo= github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 h1:dcztxKSvZ4Id8iPpHERQBbIJfabdt4wUm5qy3wOL2Zc= @@ -1064,8 +1069,9 @@ github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.m github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39/go.mod 
h1:r3f7wjNzSs2extwzU3Y+6pKfobzPh+kKFJ3ofN+3nfs= github.com/opencontainers/selinux v1.6.0/go.mod h1:VVGKuOLlE7v4PJyT6h7mNWvq1rzqiriPsEqVhc+svHE= github.com/opencontainers/selinux v1.8.0/go.mod h1:RScLhm78qiWa2gbVCcGkC7tCGdgk3ogry1nUQF8Evvo= -github.com/opencontainers/selinux v1.8.2 h1:c4ca10UMgRcvZ6h0K4HtS15UaVSBEaE+iln2LVpAuGc= github.com/opencontainers/selinux v1.8.2/go.mod h1:MUIHuUEvKB1wtJjQdOyYRgOnLD2xAPP8dBsCoU0KuF8= +github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= +github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/packethost/packngo v0.1.1-0.20180711074735-b9cb5096f54c h1:vwpFWvAO8DeIZfFeqASzZfsxuWPno9ncAebBEP0N3uE= @@ -1160,8 +1166,8 @@ github.com/sclevine/agouti v3.0.0+incompatible/go.mod h1:b4WX9W9L1sfQKXeJf1mUTLZ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= -github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128 h1:vWI7jlfJVWN//T8jrt5JduYkSid+Sl/fRz33J1jQ83k= -github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y= +github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/shirou/gopsutil v0.0.0-20181107111621-48177ef5f880 h1:1Ge4j/3uB2rxzPWD3TC+daeCw+w91z8UCUL/7WH5gn8= @@ -1570,6 +1576,7 @@ golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210816074244-15123e1e1f71/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210816183151-1e6c022a8912/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210917161153-d61c044b1678/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211013075003-97ac67df715c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/lib/cpuset/cpuset.go b/lib/cpuset/cpuset.go index 0caa966710d3..9cea06cad877 100644 --- a/lib/cpuset/cpuset.go +++ b/lib/cpuset/cpuset.go @@ -28,6 +28,17 @@ func New(cpus ...uint16) CPUSet { return cpuset } +// Copy returns a deep copy of CPUSet c. 
+func (c CPUSet) Copy() CPUSet { + cpus := make(map[uint16]struct{}, len(c.cpus)) + for k := range c.cpus { + cpus[k] = struct{}{} + } + return CPUSet{ + cpus: cpus, + } +} + // String returns the cpuset as a comma delimited set of core values and ranged func (c CPUSet) String() string { if c.Size() == 0 { diff --git a/lib/cpuset/cpuset_test.go b/lib/cpuset/cpuset_test.go index 178517680b12..e417744b03d8 100644 --- a/lib/cpuset/cpuset_test.go +++ b/lib/cpuset/cpuset_test.go @@ -207,7 +207,7 @@ func TestParse(t *testing.T) { func TestCPUSet_String(t *testing.T) { ci.Parallel(t) - + cases := []struct { cpuset CPUSet expected string @@ -222,3 +222,16 @@ func TestCPUSet_String(t *testing.T) { require.Equal(t, c.expected, c.cpuset.String()) } } + +func TestCPUSet_Copy(t *testing.T) { + ci.Parallel(t) + + original := New(1, 2, 3, 4, 5) + copied := original.Copy() + require.True(t, original.Equals(copied)) + + delete(copied.cpus, 3) + require.False(t, original.Equals(copied)) + require.True(t, original.Equals(New(1, 2, 3, 4, 5))) + require.True(t, copied.Equals(New(1, 2, 4, 5))) +} diff --git a/plugins/drivers/driver.go b/plugins/drivers/driver.go index f47dc5a2815b..4b8cd7d13531 100644 --- a/plugins/drivers/driver.go +++ b/plugins/drivers/driver.go @@ -267,7 +267,7 @@ type TaskConfig struct { JobName string JobID string TaskGroupName string - Name string + Name string // task.Name Namespace string NodeName string NodeID string diff --git a/plugins/drivers/testutils/exec_testing.go b/plugins/drivers/testutils/exec_testing.go index 8d824a16d9dc..a1cd361b11d9 100644 --- a/plugins/drivers/testutils/exec_testing.go +++ b/plugins/drivers/testutils/exec_testing.go @@ -14,8 +14,10 @@ import ( "testing" "time" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/plugins/drivers" dproto "github.com/hashicorp/nomad/plugins/drivers/proto" + "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" ) @@ -192,9 +194,35 @@ func TestExecFSIsolation(t *testing.T, driver *DriverHarness, taskID string) { false, "") require.Zero(t, r.exitCode) - if !strings.Contains(r.stdout, ":freezer:/nomad") && !strings.Contains(r.stdout, "freezer:/docker") { - require.Fail(t, "unexpected freezer cgroup", "expected freezer to be /nomad/ or /docker/, but found:\n%s", r.stdout) + if !cgutil.UseV2 { + acceptable := []string{ + ":freezer:/nomad", ":freezer:/docker", + } + if testutil.IsCI() { + // github actions freezer cgroup + acceptable = append(acceptable, ":freezer:/actions_job") + } + ok := false + for _, freezerGroup := range acceptable { + if strings.Contains(r.stdout, freezerGroup) { + ok = true + break + } + } + if !ok { + require.Fail(t, "unexpected freezer cgroup", "expected freezer to be /nomad/ or /docker/, but found:\n%s", r.stdout) + } + } else { + info, _ := driver.PluginInfo() + if info.Name == "docker" { + // Note: docker on cgroups v2 now returns nothing + // root@97b4d3d33035:/# cat /proc/self/cgroup + // 0::/ + t.Skip("/proc/self/cgroup not useful in docker cgroups.v2") + } + // e.g. 
0::/testing.slice/5bdbd6c2-8aba-3ab2-728b-0ff3a81727a9.sleep.scope + require.True(t, strings.HasSuffix(strings.TrimSpace(r.stdout), ".scope"), "actual stdout %q", r.stdout) } }) } diff --git a/plugins/drivers/testutils/testing.go b/plugins/drivers/testutils/testing.go index 5056344e0824..f0f6c6713c2c 100644 --- a/plugins/drivers/testutils/testing.go +++ b/plugins/drivers/testutils/testing.go @@ -4,17 +4,17 @@ import ( "context" "fmt" "io/ioutil" + "os" "path/filepath" "runtime" "strings" "time" - testing "github.com/mitchellh/go-testing-interface" - hclog "github.com/hashicorp/go-hclog" plugin "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/logmon" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/helper/testlog" @@ -24,6 +24,7 @@ import ( "github.com/hashicorp/nomad/plugins/base" "github.com/hashicorp/nomad/plugins/drivers" "github.com/hashicorp/nomad/plugins/shared/hclspec" + testing "github.com/mitchellh/go-testing-interface" "github.com/stretchr/testify/require" ) @@ -34,6 +35,7 @@ type DriverHarness struct { t testing.T logger hclog.Logger impl drivers.DriverPlugin + cgroup string } func (h *DriverHarness) Impl() drivers.DriverPlugin { @@ -53,12 +55,10 @@ func NewDriverHarness(t testing.T, d drivers.DriverPlugin) *DriverHarness { ) raw, err := client.Dispense(base.PluginTypeDriver) - if err != nil { - t.Fatalf("err dispensing plugin: %v", err) - } + require.NoError(t, err, "failed to dispense plugin") dClient := raw.(drivers.DriverPlugin) - h := &DriverHarness{ + return &DriverHarness{ client: client, server: server, DriverPlugin: dClient, @@ -66,13 +66,36 @@ func NewDriverHarness(t testing.T, d drivers.DriverPlugin) *DriverHarness { t: t, impl: d, } +} - return h +// setCgroup creates a v2 cgroup for the task, as if a Client were initialized +// and managing the cgroup as it normally would. +// +// Uses testing.slice as a parent. 
+func (h *DriverHarness) setCgroup(allocID, task string) { + if cgutil.UseV2 { + h.cgroup = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, task)) + f, err := os.Create(h.cgroup) + if err != nil { + panic(err) + } + defer f.Close() + } } func (h *DriverHarness) Kill() { - h.client.Close() + _ = h.client.Close() h.server.Stop() + h.cleanupCgroup() +} + +func (h *DriverHarness) cleanupCgroup() { + if cgutil.UseV2 { + err := os.Remove(h.cgroup) + if err != nil { + panic(err) + } + } } // tinyChroot is useful for testing, where we do not use anything other than @@ -139,6 +162,7 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( // Create the mock allocation alloc := mock.Alloc() + alloc.ID = t.AllocID if t.Resources != nil { alloc.AllocatedResources.Tasks[task.Name] = t.Resources.NomadResources } @@ -157,6 +181,9 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( } } + // set cgroup + h.setCgroup(alloc.ID, task.Name) + //logmon if enableLogs { lm := logmon.NewLogMon(h.logger.Named("logmon")) @@ -189,6 +216,7 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( return func() { h.client.Close() allocDir.Destroy() + h.cleanupCgroup() } } diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index 56d2d7c44f56..874d2158e473 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -94,6 +94,29 @@ be removed in future releases. The previous `Protocol` value can be viewed using the `-verbose` flag. +#### Linux Control Groups Version 2 + +Starting with Nomad 1.3.0, Linux systems configured to use [cgroups v2][cgroups2] are +now supported. A Nomad client will only activate its v2 control groups manager if the +system is configured with the cgroups2 controller mounted at `/sys/fs/cgroup`. This implies +Nomad will continue to fall back to the v1 control groups manager on systems +configured to run in hybrid mode, where the cgroups2 controller is typically mounted +at `/sys/fs/cgroup/unified`. Systems that do not support cgroups v2 are not affected. A +new client attribute `unique.cgroup.version` indicates which version of control groups +Nomad is using. + +When cgroups v2 are in use, Nomad uses `nomad.slice` as the [default parent][cgroup_parent] for cgroups +created on behalf of tasks. The cgroup created for a task is named in the form `<allocID>.<taskName>.scope`. +These cgroups are created by Nomad before a task starts. External task drivers that support +containerization should be updated to make use of the new cgroup locations. + +``` +➜ tree -d /sys/fs/cgroup/nomad.slice +/sys/fs/cgroup/nomad.slice +├── 8b8da4cf-8ebf-b578-0bcf-77190749abf3.redis.scope +└── a8c8e495-83c8-311b-4657-e6e3127e98bc.example.scope +``` + ## Nomad 1.2.6, 1.1.12, and 1.0.18 #### ACL requirement for the job parse endpoint @@ -1262,6 +1285,8 @@ state. Once that is done the client can be killed, the `data_dir` should be deleted and then Nomad 0.3.0 can be launched. [api_jobs_parse]: /api-docs/jobs#parse-job +[cgroups2]: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html +[cgroup_parent]: /docs/configuration/client#cgroup_parent [dangling-containers]: /docs/drivers/docker#dangling-containers [drain-api]: /api-docs/nodes#drain-node [drain-cli]: /docs/commands/node/drain
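For external task driver authors, the naming convention documented above is enough to locate a task's cgroup on a cgroups v2 host. The Go sketch below derives that path from an allocation ID and task name; it is illustrative only, and the `cgroupRoot`, `defaultParent`, and `scopeName` names are assumptions made for this example (inside Nomad the same values come from `cgutil.CgroupRoot`, the client's `cgroup_parent`, and `cgutil.CgroupScope`, as seen in the diff above).

```
// Sketch: compute the expected cgroup path for a task on a cgroups v2 host.
// Names below are assumptions for illustration, not the cgutil API.
package main

import (
	"fmt"
	"path/filepath"
)

const (
	cgroupRoot    = "/sys/fs/cgroup" // unified v2 mount point
	defaultParent = "nomad.slice"    // default cgroup parent under v2
)

// scopeName mirrors the "<allocID>.<taskName>.scope" convention described above.
func scopeName(allocID, taskName string) string {
	return fmt.Sprintf("%s.%s.scope", allocID, taskName)
}

func main() {
	allocID := "8b8da4cf-8ebf-b578-0bcf-77190749abf3"
	path := filepath.Join(cgroupRoot, defaultParent, scopeName(allocID, "redis"))
	fmt.Println(path)
	// prints: /sys/fs/cgroup/nomad.slice/8b8da4cf-8ebf-b578-0bcf-77190749abf3.redis.scope
}
```

A driver that needs to place task processes in that cgroup would write their PIDs to the `cgroup.procs` file inside the directory, which is the standard cgroups v2 mechanism.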