diff --git a/.changelog/12274.txt b/.changelog/12274.txt new file mode 100644 index 000000000000..3f47ae714a62 --- /dev/null +++ b/.changelog/12274.txt @@ -0,0 +1,3 @@ +```release-note:improvement +Enable support for cgroups v2 +``` diff --git a/.github/workflows/test-core.yaml b/.github/workflows/test-core.yaml index 5f2a66d3b841..672458ef3b9d 100644 --- a/.github/workflows/test-core.yaml +++ b/.github/workflows/test-core.yaml @@ -96,7 +96,7 @@ jobs: - client/devicemanager - client/dynamicplugins - client/fingerprint - # - client/lib/... + - client/lib/... - client/logmon - client/pluginmanager - client/state @@ -105,8 +105,8 @@ jobs: - client/taskenv - command - command/agent - # - drivers/docker - # - drivers/exec + - drivers/docker + - drivers/exec - drivers/java - drivers/rawexec - helper/... diff --git a/client/alloc_endpoint_test.go b/client/alloc_endpoint_test.go index 2e51a3702c46..272819ecc35c 100644 --- a/client/alloc_endpoint_test.go +++ b/client/alloc_endpoint_test.go @@ -992,6 +992,7 @@ func TestAlloc_ExecStreaming_ACL_WithIsolation_Image(t *testing.T) { // TestAlloc_ExecStreaming_ACL_WithIsolation_Chroot asserts that token only needs // alloc-exec acl policy when chroot isolation is used func TestAlloc_ExecStreaming_ACL_WithIsolation_Chroot(t *testing.T) { + ci.SkipSlow(t, "flaky on GHA; too much disk IO") ci.Parallel(t) if runtime.GOOS != "linux" || unix.Geteuid() != 0 { diff --git a/client/allocrunner/testing.go b/client/allocrunner/testing.go index da850719ef4e..41ee296ff838 100644 --- a/client/allocrunner/testing.go +++ b/client/allocrunner/testing.go @@ -70,7 +70,7 @@ func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, fu PrevAllocMigrator: allocwatcher.NoopPrevAlloc{}, DeviceManager: devicemanager.NoopMockManager(), DriverManager: drivermanager.TestDriverManager(t), - CpusetManager: cgutil.NoopCpusetManager(), + CpusetManager: new(cgutil.NoopCpusetManager), ServersContactedCh: make(chan struct{}), } return conf, cleanup diff --git a/client/client.go b/client/client.go index 826453170d8c..5b473a7fc773 100644 --- a/client/client.go +++ b/client/client.go @@ -365,7 +365,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie invalidAllocs: make(map[string]struct{}), serversContactedCh: make(chan struct{}), serversContactedOnce: sync.Once{}, - cpusetManager: cgutil.NewCpusetManager(cfg.CgroupParent, logger.Named("cpuset_manager")), + cpusetManager: cgutil.CreateCPUSetManager(cfg.CgroupParent, logger), EnterpriseClient: newEnterpriseClient(logger), } @@ -657,19 +657,23 @@ func (c *Client) init() error { // Ensure cgroups are created on linux platform if runtime.GOOS == "linux" && c.cpusetManager != nil { - err := c.cpusetManager.Init() - if err != nil { - // if the client cannot initialize the cgroup then reserved cores will not be reported and the cpuset manager - // will be disabled. 
this is common when running in dev mode under a non-root user for example - c.logger.Warn("could not initialize cpuset cgroup subsystem, cpuset management disabled", "error", err) - c.cpusetManager = cgutil.NoopCpusetManager() + // use the client configuration for reservable_cores if set + cores := c.config.ReservableCores + if len(cores) == 0 { + // otherwise lookup the effective cores from the parent cgroup + cores, _ = cgutil.GetCPUsFromCgroup(c.config.CgroupParent) + } + if cpuErr := c.cpusetManager.Init(cores); cpuErr != nil { + // If the client cannot initialize the cgroup then reserved cores will not be reported and the cpuset manager + // will be disabled. this is common when running in dev mode under a non-root user for example. + c.logger.Warn("failed to initialize cpuset cgroup subsystem, cpuset management disabled", "error", cpuErr) + c.cpusetManager = new(cgutil.NoopCpusetManager) } } return nil } -// reloadTLSConnections allows a client to reload its TLS configuration on the -// fly +// reloadTLSConnections allows a client to reload its TLS configuration on the fly func (c *Client) reloadTLSConnections(newConfig *nconfig.TLSConfig) error { var tlsWrap tlsutil.RegionWrapper if newConfig != nil && newConfig.EnableRPC { diff --git a/client/client_test.go b/client/client_test.go index bb99e6cc9208..c8cdb0f0a282 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -17,6 +17,7 @@ import ( "github.com/hashicorp/nomad/client/config" consulApi "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/fingerprint" + cstate "github.com/hashicorp/nomad/client/state" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/pluginutils/catalog" "github.com/hashicorp/nomad/helper/pluginutils/singleton" @@ -30,8 +31,6 @@ import ( psstructs "github.com/hashicorp/nomad/plugins/shared/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" - - cstate "github.com/hashicorp/nomad/client/state" "github.com/stretchr/testify/require" ) diff --git a/client/config/config.go b/client/config/config.go index 9ec256eabb6d..4767cd12e1ec 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -774,7 +774,7 @@ func DefaultConfig() *Config { CNIConfigDir: "/opt/cni/config", CNIInterfacePrefix: "eth", HostNetworks: map[string]*structs.ClientHostNetworkConfig{}, - CgroupParent: cgutil.DefaultCgroupParent, + CgroupParent: cgutil.GetCgroupParent(""), MaxDynamicPort: structs.DefaultMinDynamicPort, MinDynamicPort: structs.DefaultMaxDynamicPort, } diff --git a/client/fingerprint/cgroup.go b/client/fingerprint/cgroup.go index b0097984c79f..b24c3f0a3442 100644 --- a/client/fingerprint/cgroup.go +++ b/client/fingerprint/cgroup.go @@ -3,55 +3,86 @@ package fingerprint import ( "time" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/lib/cgutil" - - log "github.com/hashicorp/go-hclog" ) const ( + cgroupAvailable = "available" cgroupUnavailable = "unavailable" - interval = 15 + + cgroupMountPointAttribute = "unique.cgroup.mountpoint" + cgroupVersionAttribute = "unique.cgroup.version" + + cgroupDetectInterval = 15 * time.Second ) type CGroupFingerprint struct { - logger log.Logger + logger hclog.Logger lastState string mountPointDetector MountPointDetector + versionDetector CgroupVersionDetector } -// An interface to isolate calls to the cgroup library -// This facilitates testing where we can implement -// fake mount points to test various code paths +// MountPointDetector isolates calls to the cgroup 
library. +// +// This facilitates testing where we can implement fake mount points to test +// various code paths. type MountPointDetector interface { + // MountPoint returns a cgroup mount-point. + // + // In v1, this is one arbitrary subsystem (e.g. /sys/fs/cgroup/cpu). + // + // In v2, this is the actual root mount point (i.e. /sys/fs/cgroup). MountPoint() (string, error) } -// Implements the interface detector which calls the cgroups library directly +// DefaultMountPointDetector implements the interface detector which calls the cgroups +// library directly type DefaultMountPointDetector struct { } // MountPoint calls out to the default cgroup library. -func (b *DefaultMountPointDetector) MountPoint() (string, error) { +func (*DefaultMountPointDetector) MountPoint() (string, error) { return cgutil.FindCgroupMountpointDir() } +// CgroupVersionDetector isolates calls to the cgroup library. +type CgroupVersionDetector interface { + // CgroupVersion returns v1 or v2 depending on the cgroups version in use. + CgroupVersion() string +} + +// DefaultCgroupVersionDetector implements the version detector which calls the +// cgroups library directly. +type DefaultCgroupVersionDetector struct { +} + +func (*DefaultCgroupVersionDetector) CgroupVersion() string { + if cgutil.UseV2 { + return "v2" + } + return "v1" +} + // NewCGroupFingerprint returns a new cgroup fingerprinter -func NewCGroupFingerprint(logger log.Logger) Fingerprint { - f := &CGroupFingerprint{ +func NewCGroupFingerprint(logger hclog.Logger) Fingerprint { + return &CGroupFingerprint{ logger: logger.Named("cgroup"), lastState: cgroupUnavailable, - mountPointDetector: &DefaultMountPointDetector{}, + mountPointDetector: new(DefaultMountPointDetector), + versionDetector: new(DefaultCgroupVersionDetector), } - return f } // clearCGroupAttributes clears any node attributes related to cgroups that might // have been set in a previous fingerprint run. func (f *CGroupFingerprint) clearCGroupAttributes(r *FingerprintResponse) { - r.RemoveAttribute("unique.cgroup.mountpoint") + r.RemoveAttribute(cgroupMountPointAttribute) + r.RemoveAttribute(cgroupVersionAttribute) } // Periodic determines the interval at which the periodic fingerprinter will run. func (f *CGroupFingerprint) Periodic() (bool, time.Duration) { - return true, interval * time.Second + return true, cgroupDetectInterval } diff --git a/client/fingerprint/cgroup_default.go b/client/fingerprint/cgroup_default.go index 272540834ea8..f1e53c603885 100644 --- a/client/fingerprint/cgroup_default.go +++ b/client/fingerprint/cgroup_default.go @@ -1,5 +1,4 @@ //go:build !linux -// +build !linux package fingerprint diff --git a/client/fingerprint/cgroup_linux.go b/client/fingerprint/cgroup_linux.go index 36cefdf0f310..a951dca6552a 100644 --- a/client/fingerprint/cgroup_linux.go +++ b/client/fingerprint/cgroup_linux.go @@ -1,5 +1,4 @@ //go:build linux -// +build linux package fingerprint @@ -7,31 +6,30 @@ import ( "fmt" ) -const ( - cgroupAvailable = "available" -) - -// Fingerprint tries to find a valid cgroup mount point +// Fingerprint tries to find a valid cgroup mount point and the version of cgroups +// if a mount-point is present. 
func (f *CGroupFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error { mount, err := f.mountPointDetector.MountPoint() if err != nil { f.clearCGroupAttributes(resp) - return fmt.Errorf("Failed to discover cgroup mount point: %s", err) + return fmt.Errorf("failed to discover cgroup mount point: %s", err) } - // Check if a cgroup mount point was found + // Check if a cgroup mount point was found. if mount == "" { - f.clearCGroupAttributes(resp) - if f.lastState == cgroupAvailable { - f.logger.Info("cgroups are unavailable") + f.logger.Warn("cgroups are now unavailable") } f.lastState = cgroupUnavailable return nil } - resp.AddAttribute("unique.cgroup.mountpoint", mount) + // Check the version in use. + version := f.versionDetector.CgroupVersion() + + resp.AddAttribute(cgroupMountPointAttribute, mount) + resp.AddAttribute(cgroupVersionAttribute, version) resp.Detected = true if f.lastState == cgroupUnavailable { diff --git a/client/fingerprint/cgroup_test.go b/client/fingerprint/cgroup_test.go index 11119b1d09e7..23205169ab27 100644 --- a/client/fingerprint/cgroup_test.go +++ b/client/fingerprint/cgroup_test.go @@ -1,5 +1,4 @@ //go:build linux -// +build linux package fingerprint @@ -11,6 +10,7 @@ import ( "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) // A fake mount point detector that returns an empty path @@ -41,94 +41,116 @@ func (m *MountPointDetectorEmptyMountPoint) MountPoint() (string, error) { return "", nil } -func TestCGroupFingerprint(t *testing.T) { +// A fake version detector that returns the set version. +type FakeVersionDetector struct { + version string +} + +func (f *FakeVersionDetector) CgroupVersion() string { + return f.version +} + +func newRequest(node *structs.Node) *FingerprintRequest { + return &FingerprintRequest{ + Config: new(config.Config), + Node: node, + } +} + +func newNode() *structs.Node { + return &structs.Node{ + Attributes: make(map[string]string), + } +} + +func TestCgroup_MountPoint(t *testing.T) { ci.Parallel(t) - { + t.Run("mount-point fail", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorMountPointFail{}, - } - - node := &structs.Node{ - Attributes: make(map[string]string), + mountPointDetector: new(MountPointDetectorMountPointFail), + versionDetector: new(DefaultCgroupVersionDetector), } - request := &FingerprintRequest{Config: &config.Config{}, Node: node} + request := newRequest(newNode()) var response FingerprintResponse err := f.Fingerprint(request, &response) - if err == nil { - t.Fatalf("expected an error") - } - - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a != "" { - t.Fatalf("unexpected attribute found, %s", a) - } - } + require.EqualError(t, err, "failed to discover cgroup mount point: cgroup mountpoint discovery failed") + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) - { + t.Run("mount-point available", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorValidMountPoint{}, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), - } - - request := &FingerprintRequest{Config: &config.Config{}, Node: node} + request := 
newRequest(newNode()) var response FingerprintResponse err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, ok := response.Attributes["unique.cgroup.mountpoint"]; !ok { - t.Fatalf("unable to find attribute: %s", a) - } - } + require.NoError(t, err) + require.Equal(t, "/sys/fs/cgroup", response.Attributes[cgroupMountPointAttribute]) + }) - { + t.Run("mount-point empty", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorEmptyMountPoint{}, + mountPointDetector: new(MountPointDetectorEmptyMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), - } - - request := &FingerprintRequest{Config: &config.Config{}, Node: node} var response FingerprintResponse - err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a != "" { - t.Fatalf("unexpected attribute found, %s", a) - } - } - { + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) + + t.Run("mount-point already present", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupAvailable, - mountPointDetector: &MountPointDetectorValidMountPoint{}, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), + var response FingerprintResponse + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Equal(t, "/sys/fs/cgroup", response.Attributes[cgroupMountPointAttribute]) + }) +} + +func TestCgroup_Version(t *testing.T) { + t.Run("version v1", func(t *testing.T) { + f := &CGroupFingerprint{ + logger: testlog.HCLogger(t), + lastState: cgroupUnavailable, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: &FakeVersionDetector{version: "v1"}, } - request := &FingerprintRequest{Config: &config.Config{}, Node: node} var response FingerprintResponse - err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a == "" { - t.Fatalf("expected attribute to be found, %s", a) + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Equal(t, "v1", response.Attributes[cgroupVersionAttribute]) + }) + + t.Run("without mount-point", func(t *testing.T) { + f := &CGroupFingerprint{ + logger: testlog.HCLogger(t), + lastState: cgroupUnavailable, + mountPointDetector: new(MountPointDetectorEmptyMountPoint), + versionDetector: &FakeVersionDetector{version: "v1"}, } - } + + var response FingerprintResponse + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) } diff --git a/client/fingerprint/cpu_linux.go b/client/fingerprint/cpu_linux.go index a31b8e60bf67..14c80faa3aa7 100644 --- a/client/fingerprint/cpu_linux.go +++ b/client/fingerprint/cpu_linux.go @@ -5,9 +5,8 @@ import ( ) func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) { - parent := req.Config.CgroupParent - if parent == "" { - parent = cgutil.DefaultCgroupParent - } - return 
cgutil.GetCPUsFromCgroup(parent) + // The cpuset cgroup manager is initialized (on linux), but not accessible + // from the finger-printer. So we reach in and grab the information manually. + // We may assume the hierarchy is already setup. + return cgutil.GetCPUsFromCgroup(req.Config.CgroupParent) } diff --git a/client/lib/cgutil/cgutil_default.go b/client/lib/cgutil/cgutil_default.go deleted file mode 100644 index c2fc74e0da28..000000000000 --- a/client/lib/cgutil/cgutil_default.go +++ /dev/null @@ -1,14 +0,0 @@ -//go:build !linux -// +build !linux - -package cgutil - -const ( - DefaultCgroupParent = "" -) - -// FindCgroupMountpointDir is used to find the cgroup mount point on a Linux -// system. Here it is a no-op implemtation -func FindCgroupMountpointDir() (string, error) { - return "", nil -} diff --git a/client/lib/cgutil/cgutil_linux.go b/client/lib/cgutil/cgutil_linux.go index e8b867aa1f19..349660ec3e6b 100644 --- a/client/lib/cgutil/cgutil_linux.go +++ b/client/lib/cgutil/cgutil_linux.go @@ -1,121 +1,106 @@ +//go:build linux + package cgutil import ( + "fmt" "os" "path/filepath" - cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" - + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper/uuid" "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" - "golang.org/x/sys/unix" -) - -const ( - DefaultCgroupParent = "/nomad" - SharedCpusetCgroupName = "shared" - ReservedCpusetCgroupName = "reserved" + lcc "github.com/opencontainers/runc/libcontainer/configs" ) -func GetCPUsFromCgroup(group string) ([]uint16, error) { - cgroupPath, err := getCgroupPathHelper("cpuset", group) - if err != nil { - return nil, err - } +// UseV2 indicates whether only cgroups.v2 is enabled. If cgroups.v2 is not +// enabled or is running in hybrid mode with cgroups.v1, Nomad will make use of +// cgroups.v1 +// +// This is a read-only value. +var UseV2 = cgroups.IsCgroup2UnifiedMode() + +// GetCgroupParent returns the mount point under the root cgroup in which Nomad +// will create cgroups. If parent is not set, an appropriate name for the version +// of cgroups will be used. +func GetCgroupParent(parent string) string { + if UseV2 { + return getParentV2(parent) + } + return getParentV1(parent) +} - man := cgroupFs.NewManager(&configs.Cgroup{Path: group}, map[string]string{"cpuset": cgroupPath}, false) - stats, err := man.GetStats() - if err != nil { - return nil, err +// CreateCPUSetManager creates a V1 or V2 CpusetManager depending on system configuration. +func CreateCPUSetManager(parent string, logger hclog.Logger) CpusetManager { + if UseV2 { + return NewCpusetManagerV2(getParentV2(parent), logger.Named("cpuset.v2")) } - return stats.CPUSetStats.CPUs, nil + return NewCpusetManagerV1(getParentV1(parent), logger.Named("cpuset.v1")) } -func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { - if cpus, err = fscommon.ReadFile(parent, "cpuset.cpus"); err != nil { - return - } - if mems, err = fscommon.ReadFile(parent, "cpuset.mems"); err != nil { - return +// GetCPUsFromCgroup gets the effective cpuset value for the given cgroup. 
+func GetCPUsFromCgroup(group string) ([]uint16, error) { + if UseV2 { + return getCPUsFromCgroupV2(getParentV2(group)) } - return cpus, mems, nil + return getCPUsFromCgroupV1(getParentV1(group)) } -// cpusetEnsureParent makes sure that the parent directories of current -// are created and populated with the proper cpus and mems files copied -// from their respective parent. It does that recursively, starting from -// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). -func cpusetEnsureParent(current string) error { - var st unix.Statfs_t +// CgroupScope returns the name of the scope for Nomad's managed cgroups for +// the given allocID and task. +// +// e.g. "-.scope" +// +// Only useful for v2. +func CgroupScope(allocID, task string) string { + return fmt.Sprintf("%s.%s.scope", allocID, task) +} - parent := filepath.Dir(current) - err := unix.Statfs(parent, &st) - if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { +// ConfigureBasicCgroups will initialize cgroups for v1. +// +// Not useful in cgroups.v2 +func ConfigureBasicCgroups(config *lcc.Config) error { + if UseV2 { + // In v2 the default behavior is to create inherited interface files for + // all mounted subsystems automatically. return nil } - // Treat non-existing directory as cgroupfs as it will be created, - // and the root cpuset directory obviously exists. - if err != nil && err != unix.ENOENT { - return &os.PathError{Op: "statfs", Path: parent, Err: err} - } - if err := cpusetEnsureParent(parent); err != nil { - return err - } - if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) { - return err - } - return cpusetCopyIfNeeded(current, parent) -} - -// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent -// directory to the current directory if the file's contents are 0 -func cpusetCopyIfNeeded(current, parent string) error { - currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + id := uuid.Generate() + // In v1 we must setup the freezer cgroup ourselves. + subsystem := "freezer" + path, err := GetCgroupPathHelperV1(subsystem, filepath.Join(DefaultCgroupV1Parent, id)) if err != nil { - return err + return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err) } - parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) - if err != nil { + if err = os.MkdirAll(path, 0755); err != nil { return err } - - if isEmptyCpuset(currentCpus) { - if err := fscommon.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { - return err - } - } - if isEmptyCpuset(currentMems) { - if err := fscommon.WriteFile(current, "cpuset.mems", parentMems); err != nil { - return err - } + config.Cgroups.Paths = map[string]string{ + subsystem: path, } return nil } -func isEmptyCpuset(str string) bool { - return str == "" || str == "\n" -} - -func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) - if err != nil { - return "", err - } - - // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. - relCgroup, err := filepath.Rel(root, cgroup) - if err != nil { - return "", err - } - - return filepath.Join(mnt, relCgroup), nil -} - // FindCgroupMountpointDir is used to find the cgroup mount point on a Linux // system. +// +// Note that in cgroups.v1, this returns one of many subsystems that are mounted. +// e.g. 
a return value of "/sys/fs/cgroup/systemd" really implies the root is +// "/sys/fs/cgroup", which is interesting on hybrid systems where the 'unified' +// subsystem is mounted as if it were a subsystem, but the actual root is different. +// (i.e. /sys/fs/cgroup/unified). +// +// As far as Nomad is concerned, UseV2 is the source of truth for which hierarchy +// to use, and that will only be a true value if cgroups.v2 is mounted on +// /sys/fs/cgroup (i.e. system is not in v1 or hybrid mode). +// +// ➜ mount -l | grep cgroup +// tmpfs on /sys/fs/cgroup type tmpfs (ro,nosuid,nodev,noexec,mode=755,inode64) +// cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate) +// cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) +// cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) +// (etc.) func FindCgroupMountpointDir() (string, error) { mount, err := cgroups.GetCgroupMounts(false) if err != nil { @@ -127,3 +112,18 @@ func FindCgroupMountpointDir() (string, error) { } return mount[0].Mountpoint, nil } + +// CopyCpuset copies the cpuset.cpus value from source into destination. +func CopyCpuset(source, destination string) error { + correct, err := cgroups.ReadFile(source, "cpuset.cpus") + if err != nil { + return err + } + + err = cgroups.WriteFile(destination, "cpuset.cpus", correct) + if err != nil { + return err + } + + return nil +} diff --git a/client/lib/cgutil/cgutil_linux_test.go b/client/lib/cgutil/cgutil_linux_test.go new file mode 100644 index 000000000000..bbc18ef8c3e9 --- /dev/null +++ b/client/lib/cgutil/cgutil_linux_test.go @@ -0,0 +1,124 @@ +//go:build linux + +package cgutil + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/stretchr/testify/require" +) + +func TestUtil_GetCgroupParent(t *testing.T) { + ci.Parallel(t) + + t.Run("v1", func(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + t.Run("default", func(t *testing.T) { + exp := "/nomad" + parent := GetCgroupParent("") + require.Equal(t, exp, parent) + }) + + t.Run("configured", func(t *testing.T) { + exp := "/bar" + parent := GetCgroupParent("/bar") + require.Equal(t, exp, parent) + }) + }) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + t.Run("default", func(t *testing.T) { + exp := "nomad.slice" + parent := GetCgroupParent("") + require.Equal(t, exp, parent) + }) + + t.Run("configured", func(t *testing.T) { + exp := "abc.slice" + parent := GetCgroupParent("abc.slice") + require.Equal(t, exp, parent) + }) + }) +} + +func TestUtil_CreateCPUSetManager(t *testing.T) { + ci.Parallel(t) + + logger := testlog.HCLogger(t) + + t.Run("v1", func(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + parent := "/" + uuid.Short() + manager := CreateCPUSetManager(parent, logger) + err := manager.Init([]uint16{0}) + require.NoError(t, err) + require.NoError(t, cgroups.RemovePath(filepath.Join(CgroupRoot, parent))) + }) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + parent := uuid.Short() + ".slice" + manager := CreateCPUSetManager(parent, logger) + err := manager.Init([]uint16{0}) + require.NoError(t, err) + require.NoError(t, cgroups.RemovePath(filepath.Join(CgroupRoot, 
parent))) + }) +} + +func TestUtil_GetCPUsFromCgroup(t *testing.T) { + ci.Parallel(t) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + cpus, err := GetCPUsFromCgroup("system.slice") // thanks, systemd! + require.NoError(t, err) + require.NotEmpty(t, cpus) + }) +} + +func create(t *testing.T, name string) { + mgr, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, name), rootless) + require.NoError(t, err) + err = mgr.Apply(CreationPID) + require.NoError(t, err) +} + +func cleanup(t *testing.T, name string) { + err := cgroups.RemovePath(name) + require.NoError(t, err) +} + +func TestUtil_CopyCpuset(t *testing.T) { + ci.Parallel(t) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + source := uuid.Short() + ".scope" + create(t, source) + defer cleanup(t, source) + require.NoError(t, cgroups.WriteFile(filepath.Join(CgroupRoot, source), "cpuset.cpus", "0-1")) + + destination := uuid.Short() + ".scope" + create(t, destination) + defer cleanup(t, destination) + + err := CopyCpuset( + filepath.Join(CgroupRoot, source), + filepath.Join(CgroupRoot, destination), + ) + require.NoError(t, err) + + value, readErr := cgroups.ReadFile(filepath.Join(CgroupRoot, destination), "cpuset.cpus") + require.NoError(t, readErr) + require.Equal(t, "0-1", strings.TrimSpace(value)) + }) +} diff --git a/client/lib/cgutil/cgutil_noop.go b/client/lib/cgutil/cgutil_noop.go new file mode 100644 index 000000000000..91a35b14492e --- /dev/null +++ b/client/lib/cgutil/cgutil_noop.go @@ -0,0 +1,42 @@ +//go:build !linux + +package cgutil + +import ( + "github.com/hashicorp/go-hclog" +) + +const ( + // DefaultCgroupParent does not apply to non-Linux operating systems. + DefaultCgroupParent = "" +) + +// UseV2 is always false on non-Linux systems. +// +// This is a read-only value. +var UseV2 = false + +// CreateCPUSetManager creates a no-op CpusetManager for non-Linux operating systems. +func CreateCPUSetManager(string, hclog.Logger) CpusetManager { + return new(NoopCpusetManager) +} + +// FindCgroupMountpointDir returns nothing for non-Linux operating systems. +func FindCgroupMountpointDir() (string, error) { + return "", nil +} + +// GetCgroupParent returns nothing for non-Linux operating systems. +func GetCgroupParent(string) string { + return DefaultCgroupParent +} + +// GetCPUsFromCgroup returns nothing for non-Linux operating systems. +func GetCPUsFromCgroup(string) ([]uint16, error) { + return nil, nil +} + +// CgroupScope returns nothing for non-Linux operating systems. +func CgroupScope(allocID, task string) string { + return "" +} diff --git a/client/lib/cgutil/cpuset_manager.go b/client/lib/cgutil/cpuset_manager.go index 809af90fcbf1..7d16c752f84c 100644 --- a/client/lib/cgutil/cpuset_manager.go +++ b/client/lib/cgutil/cpuset_manager.go @@ -2,18 +2,26 @@ package cgutil import ( "context" + "fmt" + "path/filepath" + "strings" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/structs" ) -// CpusetManager is used to setup cpuset cgroups for each task. A pool of shared cpus is managed for -// tasks which don't require any reserved cores and a cgroup is managed seperetly for each task which -// require reserved cores. +const ( + // CgroupRoot is hard-coded in the cgroups specification. + // It only applies to linux but helpers have references to it in driver(s). + CgroupRoot = "/sys/fs/cgroup" +) + +// CpusetManager is used to setup cpuset cgroups for each task. 
type CpusetManager interface { - // Init should be called before any tasks are managed to ensure the cgroup parent exists and - // check that proper permissions are granted to manage cgroups. - Init() error + // Init should be called with the initial set of reservable cores before any + // allocations are managed. Ensures the parent cgroup exists and proper permissions + // are available for managing cgroups. + Init([]uint16) error // AddAlloc adds an allocation to the manager AddAlloc(alloc *structs.Allocation) @@ -26,8 +34,26 @@ type CpusetManager interface { CgroupPathFor(allocID, taskName string) CgroupPathGetter } -// CgroupPathGetter is a function which returns the cgroup path and any error which ocured during cgroup initialization. -// It should block until the cgroup has been created or an error is reported +type NoopCpusetManager struct{} + +func (n NoopCpusetManager) Init([]uint16) error { + return nil +} + +func (n NoopCpusetManager) AddAlloc(alloc *structs.Allocation) { +} + +func (n NoopCpusetManager) RemoveAlloc(allocID string) { +} + +func (n NoopCpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { + return func(context.Context) (string, error) { return "", nil } +} + +// CgroupPathGetter is a function which returns the cgroup path and any error which +// occurred during cgroup initialization. +// +// It should block until the cgroup has been created or an error is reported. type CgroupPathGetter func(context.Context) (path string, err error) type TaskCgroupInfo struct { @@ -37,20 +63,25 @@ type TaskCgroupInfo struct { Error error } -func NoopCpusetManager() CpusetManager { return noopCpusetManager{} } - -type noopCpusetManager struct{} +// identity is the "." string that uniquely identifies an +// individual instance of a task within the flat cgroup namespace +type identity string -func (n noopCpusetManager) Init() error { - return nil -} - -func (n noopCpusetManager) AddAlloc(alloc *structs.Allocation) { +func makeID(allocID, task string) identity { + return identity(fmt.Sprintf("%s.%s", allocID, task)) } -func (n noopCpusetManager) RemoveAlloc(allocID string) { +func makeScope(id identity) string { + return string(id) + ".scope" } -func (n noopCpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { - return func(context.Context) (string, error) { return "", nil } +// SplitPath determines the parent and cgroup from p. +// p must contain at least 2 elements (parent + cgroup). +// +// Handles the cgroup root if present. +func SplitPath(p string) (string, string) { + p = strings.TrimPrefix(p, CgroupRoot) + p = strings.Trim(p, "/") + parts := strings.Split(p, "/") + return parts[0], "/" + filepath.Join(parts[1:]...) 
} diff --git a/client/lib/cgutil/cpuset_manager_default.go b/client/lib/cgutil/cpuset_manager_default.go deleted file mode 100644 index 1f8c077baa6d..000000000000 --- a/client/lib/cgutil/cpuset_manager_default.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build !linux -// +build !linux - -package cgutil - -import ( - "github.com/hashicorp/go-hclog" -) - -func NewCpusetManager(_ string, _ hclog.Logger) CpusetManager { return noopCpusetManager{} } diff --git a/client/lib/cgutil/cpuset_manager_test.go b/client/lib/cgutil/cpuset_manager_test.go new file mode 100644 index 000000000000..88f661411d26 --- /dev/null +++ b/client/lib/cgutil/cpuset_manager_test.go @@ -0,0 +1,28 @@ +package cgutil + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/stretchr/testify/require" +) + +func TestUtil_SplitPath(t *testing.T) { + ci.Parallel(t) + + try := func(input, expParent, expCgroup string) { + parent, cgroup := SplitPath(input) + require.Equal(t, expParent, parent) + require.Equal(t, expCgroup, cgroup) + } + + // foo, /bar + try("foo/bar", "foo", "/bar") + try("/foo/bar/", "foo", "/bar") + try("/sys/fs/cgroup/foo/bar", "foo", "/bar") + + // foo, /bar/baz + try("/foo/bar/baz/", "foo", "/bar/baz") + try("foo/bar/baz", "foo", "/bar/baz") + try("/sys/fs/cgroup/foo/bar/baz", "foo", "/bar/baz") +} diff --git a/client/lib/cgutil/cpuset_manager_linux.go b/client/lib/cgutil/cpuset_manager_v1.go similarity index 59% rename from client/lib/cgutil/cpuset_manager_linux.go rename to client/lib/cgutil/cpuset_manager_v1.go index ba583c043c22..0e910705290e 100644 --- a/client/lib/cgutil/cpuset_manager_linux.go +++ b/client/lib/cgutil/cpuset_manager_v1.go @@ -1,3 +1,5 @@ +//go:build linux + package cgutil import ( @@ -10,21 +12,28 @@ import ( "sync" "time" - "github.com/opencontainers/runc/libcontainer/cgroups" - cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/structs" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +const ( + DefaultCgroupV1Parent = "/nomad" + SharedCpusetCgroupName = "shared" + ReservedCpusetCgroupName = "reserved" ) -func NewCpusetManager(cgroupParent string, logger hclog.Logger) CpusetManager { +// NewCpusetManagerV1 creates a CpusetManager compatible with cgroups.v1 +func NewCpusetManagerV1(cgroupParent string, logger hclog.Logger) CpusetManager { if cgroupParent == "" { - cgroupParent = DefaultCgroupParent + cgroupParent = DefaultCgroupV1Parent } - return &cpusetManager{ + return &cpusetManagerV1{ cgroupParent: cgroupParent, cgroupInfo: map[string]allocTaskCgroupInfo{}, logger: logger, @@ -35,7 +44,7 @@ var ( cpusetReconcileInterval = 30 * time.Second ) -type cpusetManager struct { +type cpusetManagerV1 struct { // cgroupParent relative to the cgroup root. ex. '/nomad' cgroupParent string // cgroupParentPath is the absolute path to the cgroup parent. 
@@ -53,7 +62,7 @@ type cpusetManager struct { logger hclog.Logger } -func (c *cpusetManager) AddAlloc(alloc *structs.Allocation) { +func (c *cpusetManagerV1) AddAlloc(alloc *structs.Allocation) { if alloc == nil || alloc.AllocatedResources == nil { return } @@ -77,14 +86,14 @@ func (c *cpusetManager) AddAlloc(alloc *structs.Allocation) { go c.signalReconcile() } -func (c *cpusetManager) RemoveAlloc(allocID string) { +func (c *cpusetManagerV1) RemoveAlloc(allocID string) { c.mu.Lock() delete(c.cgroupInfo, allocID) c.mu.Unlock() go c.signalReconcile() } -func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { +func (c *cpusetManagerV1) CgroupPathFor(allocID, task string) CgroupPathGetter { return func(ctx context.Context) (string, error) { c.mu.Lock() allocInfo, ok := c.cgroupInfo[allocID] @@ -99,15 +108,21 @@ func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { return "", fmt.Errorf("task %q not found", task) } + timer, stop := helper.NewSafeTimer(0) + defer stop() + for { + if taskInfo.Error != nil { break } + + timer.Reset(100 * time.Millisecond) if _, err := os.Stat(taskInfo.CgroupPath); os.IsNotExist(err) { select { case <-ctx.Done(): return taskInfo.CgroupPath, ctx.Err() - case <-time.After(100 * time.Millisecond): + case <-timer.C: continue } } @@ -119,24 +134,25 @@ func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { } +// task name -> task cgroup info type allocTaskCgroupInfo map[string]*TaskCgroupInfo // Init checks that the cgroup parent and expected child cgroups have been created // If the cgroup parent is set to /nomad then this will ensure that the /nomad/shared // cgroup is initialized. -func (c *cpusetManager) Init() error { - cgroupParentPath, err := getCgroupPathHelper("cpuset", c.cgroupParent) +func (c *cpusetManagerV1) Init(_ []uint16) error { + cgroupParentPath, err := GetCgroupPathHelperV1("cpuset", c.cgroupParent) if err != nil { return err } c.cgroupParentPath = cgroupParentPath // ensures that shared cpuset exists and that the cpuset values are copied from the parent if created - if err := cpusetEnsureParent(filepath.Join(cgroupParentPath, SharedCpusetCgroupName)); err != nil { + if err := cpusetEnsureParentV1(filepath.Join(cgroupParentPath, SharedCpusetCgroupName)); err != nil { return err } - parentCpus, parentMems, err := getCpusetSubsystemSettings(cgroupParentPath) + parentCpus, parentMems, err := getCpusetSubsystemSettingsV1(cgroupParentPath) if err != nil { return fmt.Errorf("failed to detect parent cpuset settings: %v", err) } @@ -155,7 +171,7 @@ func (c *cpusetManager) Init() error { return err } - if err := fscommon.WriteFile(filepath.Join(cgroupParentPath, ReservedCpusetCgroupName), "cpuset.mems", parentMems); err != nil { + if err := cgroups.WriteFile(filepath.Join(cgroupParentPath, ReservedCpusetCgroupName), "cpuset.mems", parentMems); err != nil { return err } @@ -168,7 +184,7 @@ func (c *cpusetManager) Init() error { return nil } -func (c *cpusetManager) reconcileLoop() { +func (c *cpusetManagerV1) reconcileLoop() { timer := time.NewTimer(0) if !timer.Stop() { <-timer.C @@ -189,7 +205,7 @@ func (c *cpusetManager) reconcileLoop() { } } -func (c *cpusetManager) reconcileCpusets() { +func (c *cpusetManagerV1) reconcileCpusets() { c.mu.Lock() defer c.mu.Unlock() sharedCpuset := cpuset.New(c.parentCpuset.ToSlice()...) 
@@ -240,13 +256,13 @@ func (c *cpusetManager) reconcileCpusets() { } // copy cpuset.mems from parent - _, parentMems, err := getCpusetSubsystemSettings(filepath.Dir(info.CgroupPath)) + _, parentMems, err := getCpusetSubsystemSettingsV1(filepath.Dir(info.CgroupPath)) if err != nil { c.logger.Error("failed to read parent cgroup settings for task", "path", info.CgroupPath, "error", err) info.Error = err continue } - if err := fscommon.WriteFile(info.CgroupPath, "cpuset.mems", parentMems); err != nil { + if err := cgroups.WriteFile(info.CgroupPath, "cpuset.mems", parentMems); err != nil { c.logger.Error("failed to write cgroup cpuset.mems setting for task", "path", info.CgroupPath, "mems", parentMems, "error", err) info.Error = err continue @@ -260,30 +276,30 @@ func (c *cpusetManager) reconcileCpusets() { } // setCgroupCpusetCPUs will compare an existing cpuset.cpus value with an expected value, overwriting the existing if different -// must hold a lock on cpusetManager.mu before calling -func (_ *cpusetManager) setCgroupCpusetCPUs(path, cpus string) error { - currentCpusRaw, err := fscommon.ReadFile(path, "cpuset.cpus") +// must hold a lock on cpusetManagerV1.mu before calling +func (_ *cpusetManagerV1) setCgroupCpusetCPUs(path, cpus string) error { + currentCpusRaw, err := cgroups.ReadFile(path, "cpuset.cpus") if err != nil { return err } if cpus != strings.TrimSpace(currentCpusRaw) { - if err := fscommon.WriteFile(path, "cpuset.cpus", cpus); err != nil { + if err := cgroups.WriteFile(path, "cpuset.cpus", cpus); err != nil { return err } } return nil } -func (c *cpusetManager) signalReconcile() { +func (c *cpusetManagerV1) signalReconcile() { select { case c.signalCh <- struct{}{}: case <-c.doneCh: } } -func (c *cpusetManager) getCpuset(group string) (cpuset.CPUSet, error) { - man := cgroupFs.NewManager( +func (c *cpusetManagerV1) getCpuset(group string) (cpuset.CPUSet, error) { + man := fs.NewManager( &configs.Cgroup{ Path: filepath.Join(c.cgroupParent, group), }, @@ -297,15 +313,119 @@ func (c *cpusetManager) getCpuset(group string) (cpuset.CPUSet, error) { return cpuset.New(stats.CPUSetStats.CPUs...), nil } -func (c *cpusetManager) getCgroupPathsForTask(allocID, task string) (absolute, relative string) { +func (c *cpusetManagerV1) getCgroupPathsForTask(allocID, task string) (absolute, relative string) { return filepath.Join(c.reservedCpusetPath(), fmt.Sprintf("%s-%s", allocID, task)), filepath.Join(c.cgroupParent, ReservedCpusetCgroupName, fmt.Sprintf("%s-%s", allocID, task)) } -func (c *cpusetManager) sharedCpusetPath() string { +func (c *cpusetManagerV1) sharedCpusetPath() string { return filepath.Join(c.cgroupParentPath, SharedCpusetCgroupName) } -func (c *cpusetManager) reservedCpusetPath() string { +func (c *cpusetManagerV1) reservedCpusetPath() string { return filepath.Join(c.cgroupParentPath, ReservedCpusetCgroupName) } + +func getCPUsFromCgroupV1(group string) ([]uint16, error) { + cgroupPath, err := GetCgroupPathHelperV1("cpuset", group) + if err != nil { + return nil, err + } + + man := fs.NewManager(&configs.Cgroup{Path: group}, map[string]string{"cpuset": cgroupPath}, false) + stats, err := man.GetStats() + if err != nil { + return nil, err + } + return stats.CPUSetStats.CPUs, nil +} + +func getParentV1(parent string) string { + if parent == "" { + return DefaultCgroupV1Parent + } + return parent +} + +// cpusetEnsureParentV1 makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective 
parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). +func cpusetEnsureParentV1(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParentV1(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeededV1(current, parent) +} + +// cpusetCopyIfNeededV1 copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeededV1(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettingsV1(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettingsV1(parent) + if err != nil { + return err + } + + if isEmptyCpusetV1(currentCpus) { + if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { + return err + } + } + if isEmptyCpusetV1(currentMems) { + if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil { + return err + } + } + return nil +} + +func getCpusetSubsystemSettingsV1(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil { + return + } + if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil { + return + } + return cpus, mems, nil +} + +func isEmptyCpusetV1(str string) bool { + return str == "" || str == "\n" +} + +func GetCgroupPathHelperV1(subsystem, cgroup string) (string, error) { + mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. 
+ relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + result := filepath.Join(mnt, relCgroup) + return result, nil +} diff --git a/client/lib/cgutil/cpuset_manager_linux_test.go b/client/lib/cgutil/cpuset_manager_v1_test.go similarity index 80% rename from client/lib/cgutil/cpuset_manager_linux_test.go rename to client/lib/cgutil/cpuset_manager_v1_test.go index e2bb00679c36..3ada7b551a44 100644 --- a/client/lib/cgutil/cpuset_manager_linux_test.go +++ b/client/lib/cgutil/cpuset_manager_v1_test.go @@ -1,51 +1,48 @@ +//go:build linux + package cgutil import ( "io/ioutil" "path/filepath" - "runtime" - "syscall" "testing" + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/mock" "github.com/opencontainers/runc/libcontainer/cgroups" - - "github.com/hashicorp/nomad/helper/uuid" - "github.com/stretchr/testify/require" - - "github.com/hashicorp/nomad/helper/testlog" ) -func tmpCpusetManager(t *testing.T) (manager *cpusetManager, cleanup func()) { - if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { - t.Skip("Test only available running as root on linux") - } +func tmpCpusetManagerV1(t *testing.T) (manager *cpusetManagerV1, cleanup func()) { mount, err := FindCgroupMountpointDir() if err != nil || mount == "" { t.Skipf("Failed to find cgroup mount: %v %v", mount, err) } parent := "/gotest-" + uuid.Short() - require.NoError(t, cpusetEnsureParent(parent)) + require.NoError(t, cpusetEnsureParentV1(parent)) - manager = &cpusetManager{ + manager = &cpusetManagerV1{ cgroupParent: parent, cgroupInfo: map[string]allocTaskCgroupInfo{}, logger: testlog.HCLogger(t), } - parentPath, err := getCgroupPathHelper("cpuset", parent) + parentPath, err := GetCgroupPathHelperV1("cpuset", parent) require.NoError(t, err) return manager, func() { require.NoError(t, cgroups.RemovePaths(map[string]string{"cpuset": parentPath})) } } -func TestCpusetManager_Init(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) +func TestCpusetManager_V1_Init(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) + require.NoError(t, manager.Init(nil)) require.DirExists(t, filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName)) require.FileExists(t, filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName, "cpuset.cpus")) @@ -57,10 +54,12 @@ func TestCpusetManager_Init(t *testing.T) { require.DirExists(t, filepath.Join(manager.cgroupParentPath, ReservedCpusetCgroupName)) } -func TestCpusetManager_AddAlloc_single(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) +func TestCpusetManager_V1_AddAlloc_single(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) + require.NoError(t, manager.Init(nil)) alloc := mock.Alloc() // reserve just one core (the 0th core, which probably exists) @@ -104,26 +103,19 @@ func TestCpusetManager_AddAlloc_single(t *testing.T) { require.Exactly(t, alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores, taskCpus.ToSlice()) } -func TestCpusetManager_AddAlloc_subset(t *testing.T) { - t.Skip("todo: add test for #11933") -} +func TestCpusetManager_V1_RemoveAlloc(t *testing.T) { + testutil.CgroupsCompatibleV1(t) -func TestCpusetManager_AddAlloc_all(t *testing.T) { - // cgroupsv2 changes behavior of writing empty 
cpuset.cpu, which is what - // happens to the /shared group when one or more allocs consume all available - // cores. - t.Skip("todo: add test for #11933") -} - -func TestCpusetManager_RemoveAlloc(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) - - // this case tests adding 2 allocs, reconciling then removing 1 alloc - // it requires the system to have atleast 2 cpu cores (one for each alloc) - if manager.parentCpuset.Size() < 2 { - t.Skip("test requires atleast 2 cpu cores") + require.NoError(t, manager.Init(nil)) + + // This case tests adding 2 allocs, reconciling then removing 1 alloc + // it requires the system to have at least 3 cpu cores (one for each alloc), + // BUT plus another one because writing an empty cpuset causes the cgroup to + // inherit the parent. + if manager.parentCpuset.Size() < 3 { + t.Skip("test requires at least 3 cpu cores") } alloc1 := mock.Alloc() diff --git a/client/lib/cgutil/cpuset_manager_v2.go b/client/lib/cgutil/cpuset_manager_v2.go new file mode 100644 index 000000000000..00162ca523a5 --- /dev/null +++ b/client/lib/cgutil/cpuset_manager_v2.go @@ -0,0 +1,332 @@ +//go:build linux + +package cgutil + +import ( + "context" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/lib/cpuset" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + // CreationPID is a special PID in libcontainer used to denote a cgroup + // should be created, but with no process added. + // + // https://github.com/opencontainers/runc/blob/v1.0.3/libcontainer/cgroups/utils.go#L372 + CreationPID = -1 + + // DefaultCgroupParentV2 is the name of Nomad's default parent cgroup, under which + // all other cgroups are managed. This can be changed with client configuration + // in case for e.g. Nomad tasks should be further constrained by an externally + // configured systemd cgroup. + DefaultCgroupParentV2 = "nomad.slice" + + // rootless is (for now) always false; Nomad clients require root, so we + // assume to not need to do the extra plumbing for rootless cgroups. + rootless = false +) + +// nothing is used for treating a map like a set with no values +type nothing struct{} + +// null represents nothing +var null = nothing{} + +type cpusetManagerV2 struct { + logger hclog.Logger + + parent string // relative to cgroup root (e.g. "nomad.slice") + parentAbs string // absolute path (e.g. 
"/sys/fs/cgroup/nomad.slice") + initial cpuset.CPUSet // set of initial cores (never changes) + + lock sync.Mutex // hold this when managing pool / sharing / isolating + pool cpuset.CPUSet // pool of cores being shared among all tasks + sharing map[identity]nothing // sharing tasks using cores only from the pool + isolating map[identity]cpuset.CPUSet // isolating tasks using cores from the pool + reserved cores +} + +func NewCpusetManagerV2(parent string, logger hclog.Logger) CpusetManager { + cgroupParent := getParentV2(parent) + return &cpusetManagerV2{ + parent: cgroupParent, + parentAbs: filepath.Join(CgroupRoot, cgroupParent), + logger: logger, + sharing: make(map[identity]nothing), + isolating: make(map[identity]cpuset.CPUSet), + } +} + +func (c *cpusetManagerV2) Init(cores []uint16) error { + c.logger.Debug("initializing with", "cores", cores) + if err := c.ensureParent(); err != nil { + c.logger.Error("failed to init cpuset manager", "err", err) + return err + } + c.initial = cpuset.New(cores...) + return nil +} + +func (c *cpusetManagerV2) AddAlloc(alloc *structs.Allocation) { + if alloc == nil || alloc.AllocatedResources == nil { + return + } + c.logger.Trace("add allocation", "name", alloc.Name, "id", alloc.ID) + + // grab write lock while we recompute and apply changes + c.lock.Lock() + defer c.lock.Unlock() + + // first update our tracking of isolating and sharing tasks + for task, resources := range alloc.AllocatedResources.Tasks { + id := makeID(alloc.ID, task) + if len(resources.Cpu.ReservedCores) > 0 { + c.isolating[id] = cpuset.New(resources.Cpu.ReservedCores...) + } else { + c.sharing[id] = null + } + } + + // recompute the available sharable cpu cores + c.recalculate() + + // now write out the entire cgroups space + c.reconcile() + + // no need to cleanup on adds, we did not remove a task +} + +func (c *cpusetManagerV2) RemoveAlloc(allocID string) { + c.logger.Trace("remove allocation", "id", allocID) + + // grab write lock while we recompute and apply changes. + c.lock.Lock() + defer c.lock.Unlock() + + // remove tasks of allocID from the sharing set + for id := range c.sharing { + if strings.HasPrefix(string(id), allocID) { + delete(c.sharing, id) + } + } + + // remove tasks of allocID from the isolating set + for id := range c.isolating { + if strings.HasPrefix(string(id), allocID) { + delete(c.isolating, id) + } + } + + // recompute available sharable cpu cores + c.recalculate() + + // now write out the entire cgroups space + c.reconcile() + + // now remove any tasks no longer running + c.cleanup() +} + +func (c *cpusetManagerV2) CgroupPathFor(allocID, task string) CgroupPathGetter { + // The CgroupPathFor implementation must block until cgroup for allocID.task + // exists [and can accept a PID]. 
+ return func(ctx context.Context) (string, error) { + ticks, cancel := helper.NewSafeTimer(100 * time.Millisecond) + defer cancel() + + for { + path := c.pathOf(makeID(allocID, task)) + mgr, err := fs2.NewManager(nil, path, rootless) + if err != nil { + return "", err + } + + if mgr.Exists() { + return path, nil + } + + select { + case <-ctx.Done(): + return "", ctx.Err() + case <-ticks.C: + continue + } + } + } +} + +// recalculate the number of cores sharable by non-isolating tasks (and isolating tasks) +// +// must be called while holding c.lock +func (c *cpusetManagerV2) recalculate() { + remaining := c.initial.Copy() + for _, set := range c.isolating { + remaining = remaining.Difference(set) + } + c.pool = remaining +} + +// reconcile will actually write the cpuset values for all tracked tasks. +// +// must be called while holding c.lock +func (c *cpusetManagerV2) reconcile() { + for id := range c.sharing { + c.write(id, c.pool) + } + + for id, set := range c.isolating { + c.write(id, c.pool.Union(set)) + } +} + +// cleanup will remove any cgroups for allocations no longer being tracked +// +// must be called while holding c.lock +func (c *cpusetManagerV2) cleanup() { + // create a map to lookup ids we know about + size := len(c.sharing) + len(c.isolating) + ids := make(map[identity]nothing, size) + for id := range c.sharing { + ids[id] = null + } + for id := range c.isolating { + ids[id] = null + } + + if err := filepath.WalkDir(c.parentAbs, func(path string, entry os.DirEntry, err error) error { + // a cgroup is a directory + if !entry.IsDir() { + return nil + } + + dir := filepath.Dir(path) + base := filepath.Base(path) + + // only manage scopes directly under nomad.slice + if dir != c.parentAbs || !strings.HasSuffix(base, ".scope") { + return nil + } + + // only remove the scope if we do not track it + id := identity(strings.TrimSuffix(base, ".scope")) + _, exists := ids[id] + if !exists { + c.remove(path) + } + + return nil + }); err != nil { + c.logger.Error("failed to cleanup cgroup", "err", err) + } +} + +//pathOf returns the absolute path to a task with identity id. +func (c *cpusetManagerV2) pathOf(id identity) string { + return filepath.Join(c.parentAbs, makeScope(id)) +} + +// remove does the actual fs delete of the cgroup +// +// We avoid removing a cgroup if it still contains a PID, as the cpuset manager +// may be initially empty on a Nomad client restart. 
+func (c *cpusetManagerV2) remove(path string) {
+	mgr, err := fs2.NewManager(nil, path, rootless)
+	if err != nil {
+		c.logger.Warn("failed to create manager", "path", path, "err", err)
+		return
+	}
+
+	// get the list of pids managed by this scope (should be 0 or 1)
+	pids, _ := mgr.GetPids()
+
+	// do not destroy the scope if a PID is still present
+	// this is a normal condition when an agent restarts with running tasks
+	// and the v2 manager is still rebuilding its tracked tasks
+	if len(pids) > 0 {
+		return
+	}
+
+	// remove the cgroup
+	if err3 := mgr.Destroy(); err3 != nil {
+		c.logger.Warn("failed to cleanup cgroup", "path", path, "err", err3)
+		return
+	}
+}
+
+// write does the actual write of cpuset set for cgroup id
+func (c *cpusetManagerV2) write(id identity, set cpuset.CPUSet) {
+	path := c.pathOf(id)
+
+	// make a manager for the cgroup
+	m, err := fs2.NewManager(nil, path, rootless)
+	if err != nil {
+		c.logger.Error("failed to manage cgroup", "path", path, "err", err)
+	}
+
+	// create the cgroup
+	if err = m.Apply(CreationPID); err != nil {
+		c.logger.Error("failed to apply cgroup", "path", path, "err", err)
+	}
+
+	// set the cpuset value for the cgroup
+	if err = m.Set(&configs.Resources{
+		CpusetCpus: set.String(),
+	}); err != nil {
+		c.logger.Error("failed to set cgroup", "path", path, "err", err)
+	}
+}
+
+// ensureParent will create the parent cgroup for the manager if it does not
+// exist yet. No PIDs are added to any cgroup yet.
+func (c *cpusetManagerV2) ensureParent() error {
+	mgr, err := fs2.NewManager(nil, c.parentAbs, rootless)
+	if err != nil {
+		return err
+	}
+
+	if err = mgr.Apply(CreationPID); err != nil {
+		return err
+	}
+
+	c.logger.Trace("establish cgroup hierarchy", "parent", c.parent)
+	return nil
+}
+
+// fromRoot returns the joined filepath of group on the CgroupRoot
+func fromRoot(group string) string {
+	return filepath.Join(CgroupRoot, group)
+}
+
+// getCPUsFromCgroupV2 retrieves the effective cpuset for the group, which must
+// be directly under the cgroup root (i.e. the parent, like nomad.slice).
+func getCPUsFromCgroupV2(group string) ([]uint16, error) {
+	path := fromRoot(group)
+	effective, err := cgroups.ReadFile(path, "cpuset.cpus.effective")
+	if err != nil {
+		return nil, err
+	}
+	set, err := cpuset.Parse(effective)
+	if err != nil {
+		return nil, err
+	}
+	return set.ToSlice(), nil
+}
+
+// getParentV2 returns parent if set, otherwise the default name of Nomad's
+// parent cgroup (i.e. nomad.slice).
+func getParentV2(parent string) string {
+	if parent == "" {
+		return DefaultCgroupParentV2
+	}
+	return parent
+}
diff --git a/client/lib/cgutil/cpuset_manager_v2_test.go b/client/lib/cgutil/cpuset_manager_v2_test.go
new file mode 100644
index 000000000000..e1d658c82589
--- /dev/null
+++ b/client/lib/cgutil/cpuset_manager_v2_test.go
@@ -0,0 +1,90 @@
+//go:build linux
+
+package cgutil
+
+import (
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/hashicorp/nomad/client/testutil"
+	"github.com/hashicorp/nomad/helper/testlog"
+	"github.com/hashicorp/nomad/helper/uuid"
+	"github.com/hashicorp/nomad/lib/cpuset"
+	"github.com/hashicorp/nomad/nomad/mock"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/stretchr/testify/require"
+)
+
+// Note: these tests need to run on GitHub Actions runners with only 2 cores.
+// It is not possible to write more cores to a cpuset than are actually available,
+// so make sure tests account for that by setting systemCores as the full set of
+// usable cores.
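getCPUsFromCgroupV2 above reads cpuset.cpus.effective, whose contents use the kernel's cpuset list format (comma-separated values and ranges), and parses it into core IDs. Before the test bodies below, here is a minimal standalone parser for that format; it is a stand-in for lib/cpuset.Parse written only for illustration:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseCpusetList parses the kernel's cpuset list format, e.g. "0-3,8,10-11",
// as found in cpuset.cpus.effective.
func parseCpusetList(s string) ([]uint16, error) {
	s = strings.TrimSpace(s)
	if s == "" {
		return nil, nil
	}
	var cores []uint16
	for _, part := range strings.Split(s, ",") {
		lo, hi, found := strings.Cut(part, "-")
		if !found {
			hi = lo // a single core, not a range
		}
		start, err := strconv.ParseUint(lo, 10, 16)
		if err != nil {
			return nil, err
		}
		end, err := strconv.ParseUint(hi, 10, 16)
		if err != nil {
			return nil, err
		}
		for c := start; c <= end; c++ {
			cores = append(cores, uint16(c))
		}
	}
	return cores, nil
}

func main() {
	cores, err := parseCpusetList("0-3,8,10-11")
	fmt.Println(cores, err) // [0 1 2 3 8 10 11] <nil>
}
```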
+var systemCores = []uint16{0, 1} + +func TestCpusetManager_V2_AddAlloc(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + + logger := testlog.HCLogger(t) + parent := uuid.Short() + ".scope" + create(t, parent) + cleanup(t, parent) + + // setup the cpuset manager + manager := NewCpusetManagerV2(parent, logger) + require.NoError(t, manager.Init(systemCores)) + + // add our first alloc, isolating 1 core + t.Run("first", func(t *testing.T) { + alloc := mock.Alloc() + alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(0).ToSlice() + manager.AddAlloc(alloc) + cpusetIs(t, "0-1", parent, alloc.ID, "web") + }) + + // add second alloc, isolating 1 core + t.Run("second", func(t *testing.T) { + alloc := mock.Alloc() + alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(1).ToSlice() + manager.AddAlloc(alloc) + cpusetIs(t, "1", parent, alloc.ID, "web") + }) + + // note that the scheduler, not the cpuset manager, is what prevents over-subscription + // and as such no logic exists here to prevent that +} + +func cpusetIs(t *testing.T, exp, parent, allocID, task string) { + scope := makeScope(makeID(allocID, task)) + value, err := cgroups.ReadFile(filepath.Join(CgroupRoot, parent, scope), "cpuset.cpus") + require.NoError(t, err) + require.Equal(t, exp, strings.TrimSpace(value)) +} + +func TestCpusetManager_V2_RemoveAlloc(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + + logger := testlog.HCLogger(t) + parent := uuid.Short() + ".scope" + create(t, parent) + cleanup(t, parent) + + // setup the cpuset manager + manager := NewCpusetManagerV2(parent, logger) + require.NoError(t, manager.Init(systemCores)) + + // alloc1 gets core 0 + alloc1 := mock.Alloc() + alloc1.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(0).ToSlice() + manager.AddAlloc(alloc1) + + // alloc2 gets core 1 + alloc2 := mock.Alloc() + alloc2.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(1).ToSlice() + manager.AddAlloc(alloc2) + cpusetIs(t, "1", parent, alloc2.ID, "web") + + // with alloc1 gone, alloc2 gets the now shared core + manager.RemoveAlloc(alloc1.ID) + cpusetIs(t, "0-1", parent, alloc2.ID, "web") +} diff --git a/client/testutil/driver_compatible.go b/client/testutil/driver_compatible.go index ff0ecc747f34..2459f8aa6c86 100644 --- a/client/testutil/driver_compatible.go +++ b/client/testutil/driver_compatible.go @@ -5,18 +5,18 @@ import ( "runtime" "syscall" "testing" - - "github.com/hashicorp/nomad/client/lib/cgutil" ) -// RequireRoot skips tests unless running on a Unix as root. +// RequireRoot skips tests unless: +// - running as root func RequireRoot(t *testing.T) { if syscall.Geteuid() != 0 { - t.Skip("Must run as root on Unix") + t.Skip("Test requires root") } } -// RequireConsul skips tests unless a Consul binary is available on $PATH. +// RequireConsul skips tests unless: +// - "consul" executable is detected on $PATH func RequireConsul(t *testing.T) { _, err := exec.Command("consul", "version").CombinedOutput() if err != nil { @@ -24,7 +24,8 @@ func RequireConsul(t *testing.T) { } } -// RequireVault skips tests unless a Vault binary is available on $PATH. 
+// RequireVault skips tests unless: +// - "vault" executable is detected on $PATH func RequireVault(t *testing.T) { _, err := exec.Command("vault", "version").CombinedOutput() if err != nil { @@ -32,19 +33,41 @@ func RequireVault(t *testing.T) { } } +// RequireLinux skips tests unless: +// - running on Linux +func RequireLinux(t *testing.T) { + if runtime.GOOS != "linux" { + t.Skip("Test requires Linux") + } +} + +// ExecCompatible skips tests unless: +// - running as root +// - running on Linux func ExecCompatible(t *testing.T) { if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { - t.Skip("Test only available running as root on linux") + t.Skip("Test requires root on Linux") } - CgroupCompatible(t) } +// JavaCompatible skips tests unless: +// - "java" executable is detected on $PATH +// - running as root +// - running on Linux func JavaCompatible(t *testing.T) { - if runtime.GOOS == "linux" && syscall.Geteuid() != 0 { - t.Skip("Test only available when running as root on linux") + _, err := exec.Command("java", "-version").CombinedOutput() + if err != nil { + t.Skipf("Test requires Java: %v", err) + } + + if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { + t.Skip("Test requires root on Linux") } } +// QemuCompatible skips tests unless: +// - "qemu-system-x86_64" executable is detected on $PATH (!windows) +// - "qemu-img" executable is detected on on $PATH (windows) func QemuCompatible(t *testing.T) { // Check if qemu exists bin := "qemu-system-x86_64" @@ -53,23 +76,19 @@ func QemuCompatible(t *testing.T) { } _, err := exec.Command(bin, "--version").CombinedOutput() if err != nil { - t.Skip("Must have Qemu installed for Qemu specific tests to run") - } -} - -func CgroupCompatible(t *testing.T) { - mount, err := cgutil.FindCgroupMountpointDir() - if err != nil || mount == "" { - t.Skipf("Failed to find cgroup mount: %v %v", mount, err) + t.Skipf("Test requires QEMU (%s)", bin) } } +// MountCompatible skips tests unless: +// - not running as windows +// - running as root func MountCompatible(t *testing.T) { if runtime.GOOS == "windows" { - t.Skip("Windows does not support mount") + t.Skip("Test requires not using Windows") } if syscall.Geteuid() != 0 { - t.Skip("Must be root to run test") + t.Skip("Test requires root") } } diff --git a/client/testutil/driver_compatible_default.go b/client/testutil/driver_compatible_default.go new file mode 100644 index 000000000000..2f7b3cb8a0ed --- /dev/null +++ b/client/testutil/driver_compatible_default.go @@ -0,0 +1,22 @@ +//go:build !linux + +package testutil + +import ( + "testing" +) + +// CgroupsCompatible returns false on non-Linux operating systems. +func CgroupsCompatible(t *testing.T) { + t.Skipf("Test requires cgroups support on Linux") +} + +// CgroupsCompatibleV1 skips tests on non-Linux operating systems. +func CgroupsCompatibleV1(t *testing.T) { + t.Skipf("Test requires cgroup.v1 support on Linux") +} + +// CgroupsCompatibleV2 skips tests on non-Linux operating systems. +func CgroupsCompatibleV2(t *testing.T) { + t.Skipf("Test requires cgroup.v2 support on Linux") +} diff --git a/client/testutil/driver_compatible_linux.go b/client/testutil/driver_compatible_linux.go new file mode 100644 index 000000000000..84ace6015f74 --- /dev/null +++ b/client/testutil/driver_compatible_linux.go @@ -0,0 +1,56 @@ +//go:build linux + +package testutil + +import ( + "runtime" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +// CgroupsCompatible returns true if either cgroups.v1 or cgroups.v2 is supported. 
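The cgroup detection helpers that follow lean on runc's cgroups package (cgroups.GetCgroupMounts for v1 and cgroups.IsCgroup2UnifiedMode for v2). As far as I can tell, the unified-mode check amounts to testing whether /sys/fs/cgroup is mounted as the cgroup2 filesystem; a minimal standalone sketch of such a check, using a single statfs call, looks like this (this is my reduction, not a claim about runc's exact implementation):

```go
//go:build linux

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// isCgroup2UnifiedMode reports whether /sys/fs/cgroup is mounted as the
// cgroup2 filesystem (unified mode), by checking the filesystem magic.
func isCgroup2UnifiedMode() (bool, error) {
	var st unix.Statfs_t
	if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
		return false, err
	}
	return st.Type == unix.CGROUP2_SUPER_MAGIC, nil
}

func main() {
	v2, err := isCgroup2UnifiedMode()
	fmt.Println("cgroup v2 unified mode:", v2, err)
}
```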
+func CgroupsCompatible(t *testing.T) bool { + return cgroupsCompatibleV1(t) || cgroupsCompatibleV2(t) +} + +// CgroupsCompatibleV1 skips tests unless: +// - cgroup.v1 mount point is detected +func CgroupsCompatibleV1(t *testing.T) { + if !cgroupsCompatibleV1(t) { + t.Skipf("Test requires cgroup.v1 support") + } +} + +func cgroupsCompatibleV1(t *testing.T) bool { + if runtime.GOOS != "linux" { + return false + } + + if cgroupsCompatibleV2(t) { + t.Log("No cgroup.v1 mount point: running in cgroup.v2 mode") + return false + } + mount, err := cgroups.GetCgroupMounts(false) + if err != nil { + t.Logf("Unable to detect cgroup.v1 mount point: %v", err) + return false + } + if len(mount) == 0 { + t.Logf("No cgroup.v1 mount point: empty path") + return false + } + return true +} + +// CgroupsCompatibleV2 skips tests unless: +// - cgroup.v2 unified mode is detected +func CgroupsCompatibleV2(t *testing.T) { + if !cgroupsCompatibleV2(t) { + t.Skip("Test requires cgroup.v2 support") + } +} + +func cgroupsCompatibleV2(t *testing.T) bool { + return cgroups.IsCgroup2UnifiedMode() +} diff --git a/command/agent/config.go b/command/agent/config.go index 7ed47aa73d2e..9c062b1a9d53 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -1763,6 +1763,11 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig { if b.BindWildcardDefaultHostNetwork { result.BindWildcardDefaultHostNetwork = true } + + if b.CgroupParent != "" { + result.CgroupParent = b.CgroupParent + } + return &result } diff --git a/drivers/docker/config.go b/drivers/docker/config.go index d45d6f24cab5..6194d60f1e46 100644 --- a/drivers/docker/config.go +++ b/drivers/docker/config.go @@ -754,7 +754,9 @@ func (d *Driver) SetConfig(c *base.Config) error { d.coordinator = newDockerCoordinator(coordinatorConfig) - d.reconciler = newReconciler(d) + d.danglingReconciler = newReconciler(d) + + d.cpusetFixer = newCpusetFixer(d) return nil } diff --git a/drivers/docker/config_test.go b/drivers/docker/config_test.go index df237a440202..d5f1acafc959 100644 --- a/drivers/docker/config_test.go +++ b/drivers/docker/config_test.go @@ -704,7 +704,7 @@ func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) { func TestConfig_DriverConfig_AllowRuntimes(t *testing.T) { ci.Parallel(t) - + cases := []struct { name string config string diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index f86879ebc475..d6e904204aef 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -20,6 +20,7 @@ import ( hclog "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" plugin "github.com/hashicorp/go-plugin" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/drivers/docker/docklog" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -122,7 +123,8 @@ type Driver struct { detected bool detectedLock sync.RWMutex - reconciler *containerReconciler + danglingReconciler *containerReconciler + cpusetFixer CpusetFixer } // NewDockerDriver returns a docker implementation of a driver plugin @@ -350,9 +352,14 @@ CREATE: container.ID, "container_state", container.State.String()) } - if containerCfg.HostConfig.CPUSet == "" && cfg.Resources.LinuxResources.CpusetCgroupPath != "" { - if err := setCPUSetCgroup(cfg.Resources.LinuxResources.CpusetCgroupPath, container.State.Pid); err != nil { - return nil, nil, fmt.Errorf("failed to set the cpuset cgroup for container: %v", err) + if !cgutil.UseV2 { + // This does not apply to cgroups.v2, 
which only allows setting the PID + // into exactly 1 group. For cgroups.v2, we use the cpuset fixer to reconcile + // the cpuset value into the cgroups created by docker in the background. + if containerCfg.HostConfig.CPUSet == "" && cfg.Resources.LinuxResources.CpusetCgroupPath != "" { + if err := setCPUSetCgroup(cfg.Resources.LinuxResources.CpusetCgroupPath, container.State.Pid); err != nil { + return nil, nil, fmt.Errorf("failed to set the cpuset cgroup for container: %v", err) + } } } @@ -841,7 +848,12 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T pidsLimit = driverConfig.PidsLimit } + // Extract the cgroup parent from the nomad cgroup (bypass the need for plugin config) + parent, _ := cgutil.SplitPath(task.Resources.LinuxResources.CpusetCgroupPath) + hostConfig := &docker.HostConfig{ + CgroupParent: parent, + Memory: memory, // hard limit MemoryReservation: memoryReservation, // soft limit diff --git a/drivers/docker/driver_darwin.go b/drivers/docker/driver_darwin.go index b0b1c81dfffc..2db02e4bf6d7 100644 --- a/drivers/docker/driver_darwin.go +++ b/drivers/docker/driver_darwin.go @@ -1,3 +1,5 @@ +//go:build darwin + package docker func setCPUSetCgroup(path string, pid int) error { diff --git a/drivers/docker/driver_darwin_test.go b/drivers/docker/driver_darwin_test.go index 18bfddd6828c..628e89a3e93e 100644 --- a/drivers/docker/driver_darwin_test.go +++ b/drivers/docker/driver_darwin_test.go @@ -1,3 +1,5 @@ +//go:build darwin + package docker import ( diff --git a/drivers/docker/driver_default.go b/drivers/docker/driver_default.go index 8607b899b199..b7bef7173408 100644 --- a/drivers/docker/driver_default.go +++ b/drivers/docker/driver_default.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package docker diff --git a/drivers/docker/driver_linux.go b/drivers/docker/driver_linux.go index b10d3241a670..97cf680b840d 100644 --- a/drivers/docker/driver_linux.go +++ b/drivers/docker/driver_linux.go @@ -1,3 +1,5 @@ +//go:build linux + package docker import ( @@ -7,7 +9,7 @@ import ( ) func setCPUSetCgroup(path string, pid int) error { - // Sometimes the container exists before we can write the + // Sometimes the container exits before we can write the // cgroup resulting in an error which can be ignored. 
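Earlier in this hunk the docker driver derives HostConfig.CgroupParent from the task's CpusetCgroupPath via cgutil.SplitPath, so container cgroups land under Nomad's parent slice. The exact semantics of SplitPath are not shown here; judging by how the reconcile_cpuset tests later in this diff use it, it appears to split off the first path element as the parent. A hypothetical sketch of that split (the helper name, behavior, and example path are all assumptions of mine):

```go
package main

import (
	"fmt"
	"strings"
)

// splitPath is a guess at what cgutil.SplitPath does: the first element of a
// cgroup-relative path is the parent, the remainder is the scope itself.
func splitPath(p string) (parent, rest string) {
	p = strings.TrimPrefix(p, "/")
	parent, rest, _ = strings.Cut(p, "/")
	return parent, rest
}

func main() {
	cpusetCgroupPath := "/nomad.slice/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope"
	parent, scope := splitPath(cpusetCgroupPath)

	// parent is what would land in docker.HostConfig.CgroupParent, so the
	// container's cgroups are created under Nomad's parent slice.
	fmt.Println("CgroupParent:", parent) // nomad.slice
	fmt.Println("scope:", scope)         // 27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope
}
```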
err := cgroups.WriteCgroupProc(path, pid) if err != nil && strings.Contains(err.Error(), "no such process") { diff --git a/drivers/docker/driver_linux_test.go b/drivers/docker/driver_linux_test.go index 006517b28fef..88c83fbba5cb 100644 --- a/drivers/docker/driver_linux_test.go +++ b/drivers/docker/driver_linux_test.go @@ -1,3 +1,5 @@ +//go:build linux + package docker import ( diff --git a/drivers/docker/driver_test.go b/drivers/docker/driver_test.go index 26cbf99161d2..cdffca518ae1 100644 --- a/drivers/docker/driver_test.go +++ b/drivers/docker/driver_test.go @@ -1507,9 +1507,7 @@ func TestDockerDriver_Init(t *testing.T) { func TestDockerDriver_CPUSetCPUs(t *testing.T) { ci.Parallel(t) testutil.DockerCompatible(t) - if runtime.GOOS == "windows" { - t.Skip("Windows does not support CPUSetCPUs.") - } + testutil.CgroupsCompatible(t) testCases := []struct { Name string diff --git a/drivers/docker/driver_unix_test.go b/drivers/docker/driver_unix_test.go index fb4fbba254e6..7f019af51614 100644 --- a/drivers/docker/driver_unix_test.go +++ b/drivers/docker/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package docker diff --git a/drivers/docker/driver_windows.go b/drivers/docker/driver_windows.go index 7eda62386f08..f3a4e22ec162 100644 --- a/drivers/docker/driver_windows.go +++ b/drivers/docker/driver_windows.go @@ -1,3 +1,5 @@ +//go:build windows + package docker import docker "github.com/fsouza/go-dockerclient" diff --git a/drivers/docker/driver_windows_test.go b/drivers/docker/driver_windows_test.go index f54b83f20956..765917231862 100644 --- a/drivers/docker/driver_windows_test.go +++ b/drivers/docker/driver_windows_test.go @@ -1,5 +1,4 @@ //go:build windows -// +build windows package docker diff --git a/drivers/docker/fingerprint.go b/drivers/docker/fingerprint.go index be5fd19c0594..f37a169788e1 100644 --- a/drivers/docker/fingerprint.go +++ b/drivers/docker/fingerprint.go @@ -13,9 +13,10 @@ import ( ) func (d *Driver) Fingerprint(ctx context.Context) (<-chan *drivers.Fingerprint, error) { - // start reconciler when we start fingerprinting - // this is the only method called when driver is launched properly - d.reconciler.Start() + // Start docker reconcilers when we start fingerprinting, a workaround for + // task drivers not having a kind of post-setup hook. + d.danglingReconciler.Start() + d.cpusetFixer.Start() ch := make(chan *drivers.Fingerprint) go d.handleFingerprint(ctx, ch) diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go index 7913796324a7..d4fc19601bd8 100644 --- a/drivers/docker/handle.go +++ b/drivers/docker/handle.go @@ -244,6 +244,12 @@ func (h *taskHandle) run() { } else if container.State.OOMKilled { oom = true werr = fmt.Errorf("OOM Killed") + } else if container.State.ExitCode == 137 { + // With cgroups.v2 it seems the cgroup OOM killer is not observed by docker + // container status. So just fudge the connection for now. + // [Mon Mar 21 19:48:21 2022] Memory cgroup out of memory: Killed process 92768 (sh) [...] 
+ oom = true + werr = fmt.Errorf("OOM Killed (137)") } // Shutdown stats collection diff --git a/drivers/docker/reconcile_cpuset.go b/drivers/docker/reconcile_cpuset.go new file mode 100644 index 000000000000..4eab59ba00d9 --- /dev/null +++ b/drivers/docker/reconcile_cpuset.go @@ -0,0 +1,125 @@ +//go:build linux + +package docker + +import ( + "context" + "fmt" + "path/filepath" + "sync" + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/lib/cgutil" + "github.com/hashicorp/nomad/helper" +) + +const ( + cpusetReconcileInterval = 1 * time.Second +) + +type CpusetFixer interface { + Start() +} + +// cpusetFixer adjusts the cpuset.cpus cgroup value to the assigned value by Nomad. +// +// Due to Docker not allowing the configuration of the full cgroup path, we must +// manually fix the cpuset values for all docker containers continuously, as the +// values will change as tasks of any driver using reserved cores are started and +// stopped, changing the size of the remaining shared cpu pool. +// +// The exec/java, podman, and containerd runtimes let you specify the cgroup path, +// making use of the cgroup Nomad creates and manages on behalf of the task. +type cpusetFixer struct { + ctx context.Context + logger hclog.Logger + interval time.Duration + once sync.Once + tasks func() map[coordinate]struct{} +} + +func newCpusetFixer(d *Driver) CpusetFixer { + return &cpusetFixer{ + interval: cpusetReconcileInterval, + ctx: d.ctx, + logger: d.logger, + tasks: d.trackedTasks, + } +} + +// Start will start the background cpuset reconciliation until the cf context is +// cancelled for shutdown. +// +// Only runs if cgroups.v2 is in use. +func (cf *cpusetFixer) Start() { + cf.once.Do(func() { + if cgutil.UseV2 { + go cf.loop() + } + }) +} + +func (cf *cpusetFixer) loop() { + timer, cancel := helper.NewSafeTimer(0) + defer cancel() + + for { + select { + case <-cf.ctx.Done(): + return + case <-timer.C: + timer.Stop() + cf.apply() + timer.Reset(cf.interval) + } + } +} + +func (cf *cpusetFixer) apply() { + coordinates := cf.tasks() + for c := range coordinates { + cf.fix(c) + } +} + +func (cf *cpusetFixer) fix(c coordinate) { + source := c.NomadCgroup() + destination := c.DockerCgroup() + if err := cgutil.CopyCpuset(source, destination); err != nil { + cf.logger.Debug("failed to copy cpuset", "err", err) + } +} + +type coordinate struct { + containerID string + allocID string + task string + path string +} + +func (c coordinate) NomadCgroup() string { + parent, _ := cgutil.SplitPath(c.path) + return filepath.Join(cgutil.CgroupRoot, parent, cgutil.CgroupScope(c.allocID, c.task)) +} + +func (c coordinate) DockerCgroup() string { + parent, _ := cgutil.SplitPath(c.path) + return filepath.Join(cgutil.CgroupRoot, parent, fmt.Sprintf("docker-%s.scope", c.containerID)) +} + +func (d *Driver) trackedTasks() map[coordinate]struct{} { + d.tasks.lock.RLock() + defer d.tasks.lock.RUnlock() + + m := make(map[coordinate]struct{}, len(d.tasks.store)) + for _, h := range d.tasks.store { + m[coordinate{ + containerID: h.containerID, + allocID: h.task.AllocID, + task: h.task.Name, + path: h.task.Resources.LinuxResources.CpusetCgroupPath, + }] = struct{}{} + } + return m +} diff --git a/drivers/docker/reconcile_cpuset_noop.go b/drivers/docker/reconcile_cpuset_noop.go new file mode 100644 index 000000000000..9613a5ad2ee0 --- /dev/null +++ b/drivers/docker/reconcile_cpuset_noop.go @@ -0,0 +1,19 @@ +//go:build !linux + +package docker + +type CpusetFixer interface { + Start() +} + +func 
newCpusetFixer(*Driver) CpusetFixer { + return new(noop) +} + +type noop struct { + // empty +} + +func (*noop) Start() { + // empty +} diff --git a/drivers/docker/reconcile_cpuset_test.go b/drivers/docker/reconcile_cpuset_test.go new file mode 100644 index 000000000000..07ed71294fa7 --- /dev/null +++ b/drivers/docker/reconcile_cpuset_test.go @@ -0,0 +1,36 @@ +//go:build linux + +package docker + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/stretchr/testify/require" +) + +func TestCoordinate_NomadCgroup(t *testing.T) { + ci.Parallel(t) + + result := (coordinate{ + containerID: "c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1", + allocID: "27ee5321-28d6-22d7-9426-4e1888da8e7d", + task: "redis", + path: "/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope", + }).NomadCgroup() + exp := "/sys/fs/cgroup/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope" + require.Equal(t, exp, result) +} + +func TestCoordinate_DockerCgroup(t *testing.T) { + ci.Parallel(t) + + result := (coordinate{ + containerID: "c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1", + allocID: "27ee5321-28d6-22d7-9426-4e1888da8e7d", + task: "redis", + path: "/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope", + }).DockerCgroup() + exp := "/sys/fs/cgroup/nomad.scope/docker-c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1.scope" + require.Equal(t, exp, result) +} diff --git a/drivers/docker/reconciler.go b/drivers/docker/reconcile_dangling.go similarity index 100% rename from drivers/docker/reconciler.go rename to drivers/docker/reconcile_dangling.go diff --git a/drivers/docker/reconciler_test.go b/drivers/docker/reconcile_dangling_test.go similarity index 100% rename from drivers/docker/reconciler_test.go rename to drivers/docker/reconcile_dangling_test.go diff --git a/drivers/docker/util/stats_posix.go b/drivers/docker/util/stats_posix.go index ba1167f344b2..3a9f03f8f539 100644 --- a/drivers/docker/util/stats_posix.go +++ b/drivers/docker/util/stats_posix.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package util diff --git a/drivers/docker/utils_unix_test.go b/drivers/docker/utils_unix_test.go index e53c72bec8ae..0a61bd545395 100644 --- a/drivers/docker/utils_unix_test.go +++ b/drivers/docker/utils_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package docker diff --git a/drivers/docker/utils_windows_test.go b/drivers/docker/utils_windows_test.go index d433055e9615..72d7e0bbfc85 100644 --- a/drivers/docker/utils_windows_test.go +++ b/drivers/docker/utils_windows_test.go @@ -1,5 +1,4 @@ //go:build windows -// +build windows package docker diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go index 422b8b42aab9..882e3ae62f12 100644 --- a/drivers/exec/driver.go +++ b/drivers/exec/driver.go @@ -9,11 +9,10 @@ import ( "sync" "time" - "github.com/hashicorp/nomad/client/lib/cgutil" - "github.com/hashicorp/nomad/drivers/shared/capabilities" - "github.com/hashicorp/consul-template/signals" hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/lib/cgutil" + "github.com/hashicorp/nomad/drivers/shared/capabilities" "github.com/hashicorp/nomad/drivers/shared/eventer" "github.com/hashicorp/nomad/drivers/shared/executor" "github.com/hashicorp/nomad/drivers/shared/resolvconf" diff --git a/drivers/exec/driver_test.go b/drivers/exec/driver_test.go index 
f5c8d9a2b57f..fed60bea9623 100644 --- a/drivers/exec/driver_test.go +++ b/drivers/exec/driver_test.go @@ -17,6 +17,7 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" ctestutils "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/executor" "github.com/hashicorp/nomad/helper/pluginutils/hclutils" @@ -31,25 +32,41 @@ import ( "github.com/stretchr/testify/require" ) +const ( + cgroupParent = "testing.slice" +) + func TestMain(m *testing.M) { if !testtask.Run() { os.Exit(m.Run()) } } -var testResources = &drivers.Resources{ - NomadResources: &structs.AllocatedTaskResources{ - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 128, +func testResources(allocID, task string) *drivers.Resources { + if allocID == "" || task == "" { + panic("must be set") + } + + r := &drivers.Resources{ + NomadResources: &structs.AllocatedTaskResources{ + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 128, + }, + Cpu: structs.AllocatedCpuResources{ + CpuShares: 100, + }, }, - Cpu: structs.AllocatedCpuResources{ - CpuShares: 100, + LinuxResources: &drivers.LinuxResources{ + MemoryLimitBytes: 134217728, + CPUShares: 100, }, - }, - LinuxResources: &drivers.LinuxResources{ - MemoryLimitBytes: 134217728, - CPUShares: 100, - }, + } + + if cgutil.UseV2 { + r.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, cgroupParent, cgutil.CgroupScope(allocID, task)) + } + + return r } func TestExecDriver_Fingerprint_NonLinux(t *testing.T) { @@ -100,7 +117,6 @@ func TestExecDriver_Fingerprint(t *testing.T) { func TestExecDriver_StartWait(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -108,34 +124,35 @@ func TestExecDriver_StartWait(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "cat", Args: []string{"/proc/self/cgroup"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Zero(result.ExitCode) - require.NoError(harness.DestroyTask(task.ID, true)) + require.Zero(t, result.ExitCode) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_StartWaitStopKill(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -143,29 +160,31 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "/bin/bash", Args: []string{"-c", "echo hi; sleep 600"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) 
defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { harness.StopTask(task.ID, 2*time.Second, "SIGINT") @@ -173,9 +192,9 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { select { case result := <-ch: - require.False(result.Successful()) + require.False(t, result.Successful()) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -191,54 +210,55 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_StartWaitRecover(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) - dctx, dcancel := context.WithCancel(context.Background()) - defer dcancel() + dCtx, dCancel := context.WithCancel(context.Background()) + defer dCancel() - d := NewExecDriver(dctx, testlog.HCLogger(t)) + d := NewExecDriver(dCtx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "/bin/sleep", Args: []string{"5"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ctx, cancel := context.WithCancel(context.Background()) ch, err := harness.WaitTask(ctx, handle.Config.ID) - require.NoError(err) + require.NoError(t, err) var wg sync.WaitGroup wg.Add(1) go func() { defer wg.Done() result := <-ch - require.Error(result.Err) + require.Error(t, result.Err) }() - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) cancel() waitCh := make(chan struct{}) @@ -250,31 +270,30 @@ func TestExecDriver_StartWaitRecover(t *testing.T) { select { case <-waitCh: status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateRunning, status.State) + require.NoError(t, err) + require.Equal(t, drivers.TaskStateRunning, status.State) case <-time.After(1 * time.Second): - require.Fail("timeout waiting for task wait to cancel") + require.Fail(t, "timeout waiting for task wait to cancel") } // Loose task d.(*Driver).tasks.Delete(task.ID) _, err = harness.InspectTask(task.ID) - require.Error(err) + require.Error(t, err) - require.NoError(harness.RecoverTask(handle)) + require.NoError(t, harness.RecoverTask(handle)) status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateRunning, status.State) + require.NoError(t, err) + require.Equal(t, drivers.TaskStateRunning, status.State) - require.NoError(harness.StopTask(task.ID, 0, "")) - require.NoError(harness.DestroyTask(task.ID, true)) + 
require.NoError(t, harness.StopTask(task.ID, 0, "")) + require.NoError(t, harness.DestroyTask(task.ID, true)) } // TestExecDriver_NoOrphans asserts that when the main // task dies, the orphans in the PID namespaces are killed by the kernel func TestExecDriver_NoOrphans(t *testing.T) { ci.Parallel(t) - r := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -291,13 +310,19 @@ func TestExecDriver_NoOrphans(t *testing.T) { } var data []byte - r.NoError(basePlug.MsgPackEncode(&data, config)) + require.NoError(t, basePlug.MsgPackEncode(&data, config)) baseConfig := &basePlug.Config{PluginConfig: data} - r.NoError(harness.SetConfig(baseConfig)) + require.NoError(t, harness.SetConfig(baseConfig)) + allocID := uuid.Generate() task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "test", + AllocID: allocID, + ID: uuid.Generate(), + Name: "test", + } + + if cgutil.UseV2 { + task.Resources = testResources(allocID, "test") } cleanup := harness.MkAllocDir(task, true) @@ -307,21 +332,21 @@ func TestExecDriver_NoOrphans(t *testing.T) { taskConfig["command"] = "/bin/sh" // print the child PID in the task PID namespace, then sleep for 5 seconds to give us a chance to examine processes taskConfig["args"] = []string{"-c", fmt.Sprintf(`sleep 3600 & sleep 20`)} - r.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) handle, _, err := harness.StartTask(task) - r.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) waitCh, err := harness.WaitTask(context.Background(), handle.Config.ID) - r.NoError(err) + require.NoError(t, err) - r.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) var childPids []int taskState := TaskState{} testutil.WaitForResult(func() (bool, error) { - r.NoError(handle.GetDriverState(&taskState)) + require.NoError(t, handle.GetDriverState(&taskState)) if taskState.Pid == 0 { return false, fmt.Errorf("task PID is zero") } @@ -343,14 +368,14 @@ func TestExecDriver_NoOrphans(t *testing.T) { } return true, nil }, func(err error) { - r.NoError(err) + require.NoError(t, err) }) select { case result := <-waitCh: - r.True(result.Successful(), "command failed: %#v", result) + require.True(t, result.Successful(), "command failed: %#v", result) case <-time.After(30 * time.Second): - r.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // isProcessRunning returns an error if process is not running @@ -369,7 +394,7 @@ func TestExecDriver_NoOrphans(t *testing.T) { } // task should be dead - r.Error(isProcessRunning(taskState.Pid)) + require.Error(t, isProcessRunning(taskState.Pid)) // all children should eventually be killed by OS testutil.WaitForResult(func() (bool, error) { @@ -384,13 +409,12 @@ func TestExecDriver_NoOrphans(t *testing.T) { } return true, nil }, func(err error) { - r.NoError(err) + require.NoError(t, err) }) } func TestExecDriver_Stats(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) dctx, dcancel := context.WithCancel(context.Background()) @@ -398,45 +422,47 @@ func TestExecDriver_Stats(t *testing.T) { d := NewExecDriver(dctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, 
"test"), } tc := &TaskConfig{ Command: "/bin/sleep", Args: []string{"5"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) ctx, cancel := context.WithCancel(context.Background()) defer cancel() statsCh, err := harness.TaskStats(ctx, task.ID, time.Second*10) - require.NoError(err) + require.NoError(t, err) select { case stats := <-statsCh: - require.NotEmpty(stats.ResourceUsage.MemoryStats.Measured) - require.NotZero(stats.Timestamp) - require.WithinDuration(time.Now(), time.Unix(0, stats.Timestamp), time.Second) + require.NotEmpty(t, stats.ResourceUsage.MemoryStats.Measured) + require.NotZero(t, stats.Timestamp) + require.WithinDuration(t, time.Now(), time.Unix(0, stats.Timestamp), time.Second) case <-time.After(time.Second): - require.Fail("timeout receiving from channel") + require.Fail(t, "timeout receiving from channel") } - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -444,10 +470,12 @@ func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "test"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -461,34 +489,33 @@ func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { fmt.Sprintf(`sleep 1; echo -n %s > /alloc/%s`, string(exp), file), }, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) // Task should terminate quickly waitCh, err := harness.WaitTask(context.Background(), task.ID) - require.NoError(err) + require.NoError(t, err) select { case res := <-waitCh: - require.True(res.Successful(), "task should have exited successfully: %v", res) + require.True(t, res.Successful(), "task should have exited successfully: %v", res) case <-time.After(time.Duration(testutil.TestMultiplier()*5) * time.Second): - require.Fail("timeout waiting for task") + require.Fail(t, "timeout waiting for task") } // Check that data was written to the shared alloc directory. 
outputFile := filepath.Join(task.TaskDir().SharedAllocDir, file) act, err := ioutil.ReadFile(outputFile) - require.NoError(err) - require.Exactly(exp, act) + require.NoError(t, err) + require.Exactly(t, exp, act) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_User(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -496,11 +523,13 @@ func TestExecDriver_User(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", User: "alice", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -509,11 +538,11 @@ func TestExecDriver_User(t *testing.T) { Command: "/bin/sleep", Args: []string{"100"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.Error(err) - require.Nil(handle) + require.Error(t, err) + require.Nil(t, handle) msg := "user alice" if !strings.Contains(err.Error(), msg) { @@ -525,18 +554,20 @@ func TestExecDriver_User(t *testing.T) { // executes commands inside the container. func TestExecDriver_HandlerExec(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) + ctestutils.CgroupsCompatibleV1(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -545,11 +576,11 @@ func TestExecDriver_HandlerExec(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) // Exec a command that should work and dump the environment // TODO: enable section when exec env is fully loaded @@ -575,8 +606,8 @@ func TestExecDriver_HandlerExec(t *testing.T) { // Assert cgroup membership res, err := harness.ExecTask(task.ID, []string{"/bin/cat", "/proc/self/cgroup"}, time.Second) - require.NoError(err) - require.True(res.ExitResult.Successful()) + require.NoError(t, err) + require.True(t, res.ExitResult.Successful()) found := false for _, line := range strings.Split(string(res.Stdout), "\n") { // Every cgroup entry should be /nomad/$ALLOC_ID @@ -594,41 +625,41 @@ func TestExecDriver_HandlerExec(t *testing.T) { } found = true } - require.True(found, "exec'd command isn't in the task's cgroup") + require.True(t, found, "exec'd command isn't in the task's cgroup") // Exec a command that should fail res, err = harness.ExecTask(task.ID, []string{"/usr/bin/stat", "lkjhdsaflkjshowaisxmcvnlia"}, time.Second) - require.NoError(err) - require.False(res.ExitResult.Successful()) + require.NoError(t, err) + require.False(t, res.ExitResult.Successful()) if expected := "No such file or directory"; !bytes.Contains(res.Stdout, []byte(expected)) { 
t.Fatalf("expected output to contain %q but found: %q", expected, res.Stdout) } - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_DevicesAndMounts(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) tmpDir, err := ioutil.TempDir("", "exec_binds_mounts") - require.NoError(err) + require.NoError(t, err) defer os.RemoveAll(tmpDir) err = ioutil.WriteFile(filepath.Join(tmpDir, "testfile"), []byte("from-host"), 600) - require.NoError(err) + require.NoError(t, err) ctx, cancel := context.WithCancel(context.Background()) defer cancel() d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ ID: uuid.Generate(), Name: "test", User: "root", // need permission to read mounts paths - Resources: testResources, + Resources: testResources(allocID, "test"), StdoutPath: filepath.Join(tmpDir, "task-stdout"), StderrPath: filepath.Join(tmpDir, "task-stderr"), Devices: []*drivers.DeviceConfig{ @@ -652,8 +683,8 @@ func TestExecDriver_DevicesAndMounts(t *testing.T) { }, } - require.NoError(ioutil.WriteFile(task.StdoutPath, []byte{}, 660)) - require.NoError(ioutil.WriteFile(task.StderrPath, []byte{}, 660)) + require.NoError(t, ioutil.WriteFile(task.StdoutPath, []byte{}, 660)) + require.NoError(t, ioutil.WriteFile(task.StderrPath, []byte{}, 660)) tc := &TaskConfig{ Command: "/bin/bash", @@ -669,38 +700,38 @@ touch /tmp/task-path-ro/testfile-from-ro && echo from-exec > /tmp/task-path-ro/ exit 0 `}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) stdout, err := ioutil.ReadFile(task.StdoutPath) - require.NoError(err) - require.Equal(`mounted device /inserted-random: 1:8 + require.NoError(t, err) + require.Equal(t, `mounted device /inserted-random: 1:8 reading from ro path: from-host reading from rw path: from-host overwriting file in rw succeeded writing new file in rw succeeded`, strings.TrimSpace(string(stdout))) stderr, err := ioutil.ReadFile(task.StderrPath) - require.NoError(err) - require.Equal(`touch: cannot touch '/tmp/task-path-ro/testfile': Read-only file system + require.NoError(t, err) + require.Equal(t, `touch: cannot touch '/tmp/task-path-ro/testfile': Read-only file system touch: cannot touch '/tmp/task-path-ro/testfile-from-ro': Read-only file system`, strings.TrimSpace(string(stderr))) // testing exit code last so we can inspect output first - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) fromRWContent, err := ioutil.ReadFile(filepath.Join(tmpDir, "testfile-from-rw")) - require.NoError(err) - require.Equal("from-exec", strings.TrimSpace(string(fromRWContent))) + require.NoError(t, err) + require.Equal(t, "from-exec", strings.TrimSpace(string(fromRWContent))) } func TestConfig_ParseAllHCL(t *testing.T) { @@ -719,13 +750,11 @@ config { var tc *TaskConfig hclutils.NewConfigParser(taskConfigSpec).ParseHCL(t, cfgStr, &tc) - require.EqualValues(t, expected, tc) } func TestExecDriver_NoPivotRoot(t *testing.T) { ci.Parallel(t) - r 
:= require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -741,14 +770,16 @@ func TestExecDriver_NoPivotRoot(t *testing.T) { } var data []byte - r.NoError(basePlug.MsgPackEncode(&data, config)) + require.NoError(t, basePlug.MsgPackEncode(&data, config)) bconfig := &basePlug.Config{PluginConfig: data} - r.NoError(harness.SetConfig(bconfig)) + require.NoError(t, harness.SetConfig(bconfig)) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -757,11 +788,11 @@ func TestExecDriver_NoPivotRoot(t *testing.T) { Command: "/bin/sleep", Args: []string{"100"}, } - r.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - r.NoError(err) - r.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) } func TestDriver_Config_validate(t *testing.T) { diff --git a/drivers/exec/driver_unix_test.go b/drivers/exec/driver_unix_test.go index 6d62902f8a0c..ce765835dc3c 100644 --- a/drivers/exec/driver_unix_test.go +++ b/drivers/exec/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package exec @@ -11,6 +10,7 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" ctestutils "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/capabilities" "github.com/hashicorp/nomad/drivers/shared/executor" @@ -26,7 +26,6 @@ import ( func TestExecDriver_StartWaitStop(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -34,29 +33,31 @@ func TestExecDriver_StartWaitStop(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } taskConfig := map[string]interface{}{ "command": "/bin/sleep", "args": []string{"600"}, } - require.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) defer harness.DestroyTask(task.ID, true) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { harness.StopTask(task.ID, 2*time.Second, "SIGKILL") @@ -64,9 +65,9 @@ func TestExecDriver_StartWaitStop(t *testing.T) { select { case result := <-ch: - require.Equal(int(unix.SIGKILL), result.Signal) + require.Equal(t, int(unix.SIGKILL), result.Signal) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -82,13 +83,12 @@ func TestExecDriver_StartWaitStop(t *testing.T) { return true, nil }, func(err error) { 
- require.NoError(err) + require.NoError(t, err) }) } func TestExec_ExecTaskStreaming(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -102,6 +102,12 @@ func TestExec_ExecTaskStreaming(t *testing.T) { Name: "sleep", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = testResources(allocID, "sleep") + } + cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -109,14 +115,13 @@ func TestExec_ExecTaskStreaming(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) _, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.ExecTaskStreamingConformanceTests(t, harness, task.ID) - } // Tests that a given DNSConfig properly configures dns @@ -124,7 +129,6 @@ func TestExec_dnsConfig(t *testing.T) { ci.Parallel(t) ctestutils.RequireRoot(t) ctestutils.ExecCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -163,6 +167,11 @@ func TestExec_dnsConfig(t *testing.T) { DNS: c.cfg, } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.Resources = testResources(allocID, "sleep") + } + cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -170,10 +179,10 @@ func TestExec_dnsConfig(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) _, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.TestTaskDNSConfig(t, harness, task.ID, c.cfg) @@ -189,6 +198,12 @@ func TestExecDriver_Capabilities(t *testing.T) { Name: "sleep", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = testResources(allocID, "sleep") + } + for _, tc := range []struct { Name string CapAdd []string diff --git a/drivers/java/driver_test.go b/drivers/java/driver_test.go index 63cb4c12043d..cc7a74378053 100644 --- a/drivers/java/driver_test.go +++ b/drivers/java/driver_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "github.com/hashicorp/nomad/client/lib/cgutil" "io" "io/ioutil" "os" @@ -12,14 +13,13 @@ import ( "time" "github.com/hashicorp/nomad/ci" - dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" - ctestutil "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/pluginutils/hclutils" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" + dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" ) @@ -60,7 +60,6 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -72,6 +71,7 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { Args: []string{"1"}, JvmOpts: []string{"-Xmx64m", "-Xms32m"}, } + task := basicTask(t, "demo-app", tc) cleanup := harness.MkAllocDir(task, true) @@ -80,28 +80,27 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { copyFile("./test-resources/demoapp.jar", filepath.Join(task.TaskDir().Dir, 
"demoapp.jar"), t) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Nil(result.Err) + require.Nil(t, result.Err) - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) // Get the stdout of the process and assert that it's not empty stdout, err := ioutil.ReadFile(filepath.Join(task.TaskDir().LogDir, "demo-app.stdout.0")) - require.NoError(err) - require.Contains(string(stdout), "Hello") + require.NoError(t, err) + require.Contains(t, string(stdout), "Hello") - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -121,12 +120,12 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { copyFile("./test-resources/demoapp.jar", filepath.Join(task.TaskDir().Dir, "demoapp.jar"), t) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { time.Sleep(10 * time.Millisecond) @@ -135,9 +134,9 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { select { case result := <-ch: - require.False(result.Successful()) + require.False(t, result.Successful()) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -153,17 +152,16 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaDriver_Class_Start_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -182,21 +180,21 @@ func TestJavaDriver_Class_Start_Wait(t *testing.T) { copyFile("./test-resources/Hello.class", filepath.Join(task.TaskDir().Dir, "Hello.class"), t) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Nil(result.Err) + require.Nil(t, result.Err) - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) // Get the stdout of the process and assert that it's not empty stdout, err := ioutil.ReadFile(filepath.Join(task.TaskDir().LogDir, "demo-app.stdout.0")) - require.NoError(err) - require.Contains(string(stdout), "Hello") + require.NoError(t, err) + require.Contains(t, string(stdout), "Hello") - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaCmdArgs(t *testing.T) { @@ -254,7 +252,6 @@ func TestJavaDriver_ExecTaskStreaming(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -274,7 +271,7 @@ func 
TestJavaDriver_ExecTaskStreaming(t *testing.T) { copyFile("./test-resources/Hello.class", filepath.Join(task.TaskDir().Dir, "Hello.class"), t) _, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.ExecTaskStreamingConformanceTests(t, harness, task.ID) @@ -283,9 +280,11 @@ func TestJavaDriver_ExecTaskStreaming(t *testing.T) { func basicTask(t *testing.T, name string, taskConfig *TaskConfig) *drivers.TaskConfig { t.Helper() + allocID := uuid.Generate() task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: name, + AllocID: allocID, + ID: uuid.Generate(), + Name: name, Resources: &drivers.Resources{ NomadResources: &structs.AllocatedTaskResources{ Memory: structs.AllocatedMemoryResources{ @@ -302,6 +301,10 @@ func basicTask(t *testing.T, name string, taskConfig *TaskConfig) *drivers.TaskC }, } + if cgutil.UseV2 { + task.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, name)) + } + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) return task } diff --git a/drivers/rawexec/driver_test.go b/drivers/rawexec/driver_test.go index 4b76350980fe..bac01470c708 100644 --- a/drivers/rawexec/driver_test.go +++ b/drivers/rawexec/driver_test.go @@ -323,6 +323,8 @@ func TestRawExecDriver_Start_Wait_AllocDir(t *testing.T) { func TestRawExecDriver_Start_Kill_Wait_Cgroup(t *testing.T) { ci.Parallel(t) ctestutil.ExecCompatible(t) + ctestutil.CgroupsCompatibleV1(t) // todo(shoenig) #12348 + require := require.New(t) pidFile := "pid" @@ -414,6 +416,9 @@ func TestRawExecDriver_Start_Kill_Wait_Cgroup(t *testing.T) { func TestRawExecDriver_Exec(t *testing.T) { ci.Parallel(t) + ctestutil.ExecCompatible(t) + ctestutil.CgroupsCompatibleV1(t) // todo(shoenig) #12348 + require := require.New(t) d := newEnabledRawExecDriver(t) diff --git a/drivers/rawexec/driver_unix_test.go b/drivers/rawexec/driver_unix_test.go index bf7e0c4e2c35..895906bc9131 100644 --- a/drivers/rawexec/driver_unix_test.go +++ b/drivers/rawexec/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package rawexec @@ -18,8 +17,11 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" + clienttestutil "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/testtask" "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" basePlug "github.com/hashicorp/nomad/plugins/base" "github.com/hashicorp/nomad/plugins/drivers" dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" @@ -30,9 +32,7 @@ import ( func TestRawExecDriver_User(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } + clienttestutil.RequireLinux(t) require := require.New(t) d := newEnabledRawExecDriver(t) @@ -205,13 +205,8 @@ func TestRawExecDriver_StartWaitStop(t *testing.T) { // task processes are cleaned up. 
func TestRawExecDriver_DestroyKillsAll(t *testing.T) { ci.Parallel(t) - - // This only works reliably with cgroup PID tracking, happens in linux only - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } - - require := require.New(t) + clienttestutil.RequireLinux(t) + clienttestutil.CgroupsCompatibleV1(t) // todo(shoenig): #12348 d := newEnabledRawExecDriver(t) harness := dtestutil.NewDriverHarness(t, d) @@ -222,6 +217,15 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { Name: "test", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = new(drivers.Resources) + task.Resources.NomadResources = new(structs.AllocatedTaskResources) + task.Resources.LinuxResources = new(drivers.LinuxResources) + task.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, "test")) + } + cleanup := harness.MkAllocDir(task, true) defer cleanup() @@ -229,20 +233,20 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { taskConfig["command"] = "/bin/sh" taskConfig["args"] = []string{"-c", fmt.Sprintf(`sleep 3600 & echo "SLEEP_PID=$!"`)} - require.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) select { case result := <-ch: - require.True(result.Successful(), "command failed: %#v", result) + require.True(t, result.Successful(), "command failed: %#v", result) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } sleepPid := 0 @@ -268,7 +272,7 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { sleepPid = pid return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) // isProcessRunning returns an error if process is not running @@ -286,9 +290,9 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { return nil } - require.NoError(isProcessRunning(sleepPid)) + require.NoError(t, isProcessRunning(sleepPid)) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) testutil.WaitForResult(func() (bool, error) { err := isProcessRunning(sleepPid) @@ -302,7 +306,7 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) } @@ -342,9 +346,7 @@ func TestRawExec_ExecTaskStreaming(t *testing.T) { func TestRawExec_ExecTaskStreaming_User(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("skip, requires running on Linux for testing custom user setting") - } + clienttestutil.RequireLinux(t) d := newEnabledRawExecDriver(t) harness := dtestutil.NewDriverHarness(t, d) @@ -381,9 +383,7 @@ func TestRawExec_ExecTaskStreaming_User(t *testing.T) { func TestRawExecDriver_NoCgroup(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } + clienttestutil.RequireLinux(t) expectedBytes, err := ioutil.ReadFile("/proc/self/cgroup") require.NoError(t, err) diff --git a/drivers/shared/executor/executor.go b/drivers/shared/executor/executor.go index 5026730b6971..99819d85a66d 100644 --- a/drivers/shared/executor/executor.go +++ b/drivers/shared/executor/executor.go @@ -302,7 
+302,8 @@ func (e *UniversalExecutor) Launch(command *ExecCommand) (*ProcessState, error) // Setup cgroups on linux if e.commandCfg.ResourceLimits || e.commandCfg.BasicProcessCgroup { - if err := e.configureResourceContainer(os.Getpid()); err != nil { + pid := os.Getpid() + if err := e.configureResourceContainer(pid); err != nil { return nil, err } } diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 989c4a062c4c..cdb14f64dba1 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -1,10 +1,10 @@ //go:build linux -// +build linux package executor import ( "context" + "errors" "fmt" "io" "os" @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/consul-template/signals" hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/stats" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -37,10 +38,6 @@ import ( "golang.org/x/sys/unix" ) -const ( - defaultCgroupParent = "/nomad" -) - var ( // ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1 ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"} @@ -71,7 +68,6 @@ type LibcontainerExecutor struct { } func NewExecutorWithIsolation(logger hclog.Logger) Executor { - logger = logger.Named("isolated_executor") if err := shelpers.Init(); err != nil { logger.Error("unable to initialize stats", "error", err) @@ -665,14 +661,27 @@ func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error { } func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error { - // If resources are not limited then manually create cgroups needed if !command.ResourceLimits { - return configureBasicCgroups(cfg) + return cgutil.ConfigureBasicCgroups(cfg) } - id := uuid.Generate() - cfg.Cgroups.Path = filepath.Join("/", defaultCgroupParent, id) + // set cgroups path + if cgutil.UseV2 { + // in v2, the cgroup must have been created by the client already, + // which breaks a lot of existing tests that run drivers without a client + if command.Resources == nil || command.Resources.LinuxResources == nil || command.Resources.LinuxResources.CpusetCgroupPath == "" { + return errors.New("cgroup path must be set") + } + parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath) + cfg.Cgroups.Path = filepath.Join("/", parent, cgroup) + } else { + // in v1, the cgroup is created using /nomad, which is a bug because it + // does not respect the cgroup_parent client configuration + // (but makes testing easy) + id := uuid.Generate() + cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id) + } if command.Resources == nil || command.Resources.NomadResources == nil { return nil @@ -715,44 +724,6 @@ func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error { return nil } -func configureBasicCgroups(cfg *lconfigs.Config) error { - id := uuid.Generate() - - // Manually create freezer cgroup - - subsystem := "freezer" - - path, err := getCgroupPathHelper(subsystem, filepath.Join(defaultCgroupParent, id)) - if err != nil { - return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err) - } - - if err = os.MkdirAll(path, 0755); err != nil { - return err - } - - cfg.Cgroups.Paths = map[string]string{ - subsystem: path, - 
} - return nil -} - -func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) - if err != nil { - return "", err - } - - // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. - relCgroup, err := filepath.Rel(root, cgroup) - if err != nil { - return "", err - } - - return filepath.Join(mnt, relCgroup), nil -} - func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) { cfg := &lconfigs.Config{ Cgroups: &lconfigs.Cgroup{ diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 67a991111471..d38072ad561c 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -14,6 +14,7 @@ import ( "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -82,6 +83,12 @@ func testExecutorCommandWithChroot(t *testing.T) *testExecCmd { }, } + if cgutil.UseV2 { + cmd.Resources.LinuxResources = &drivers.LinuxResources{ + CpusetCgroupPath: filepath.Join(cgutil.CgroupRoot, "testing.scope", cgutil.CgroupScope(alloc.ID, task.Name)), + } + } + testCmd := &testExecCmd{ command: cmd, allocDir: allocDir, @@ -164,8 +171,10 @@ func TestExecutor_Isolation_PID_and_IPC_hostMode(t *testing.T) { func TestExecutor_IsolationAndConstraints(t *testing.T) { ci.Parallel(t) - r := require.New(t) testutil.ExecCompatible(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig): hard codes cgroups v1 lookup + + r := require.New(t) testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir @@ -256,9 +265,10 @@ passwd` // hierarchy created for this process func TestExecutor_CgroupPaths(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + require := require.New(t) + testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir execCmd.Cmd = "/bin/bash" @@ -280,27 +290,33 @@ func TestExecutor_CgroupPaths(t *testing.T) { tu.WaitForResult(func() (bool, error) { output := strings.TrimSpace(testExecCmd.stdout.String()) - // Verify that we got some cgroups - if !strings.Contains(output, ":devices:") { - return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) - } - lines := strings.Split(output, "\n") - for _, line := range lines { - // Every cgroup entry should be /nomad/$ALLOC_ID - if line == "" { - continue + switch cgutil.UseV2 { + case true: + isScope := strings.HasSuffix(output, ".scope") + require.True(isScope) + case false: + // Verify that we got some cgroups + if !strings.Contains(output, ":devices:") { + return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) } + lines := strings.Split(output, "\n") + for _, line := range lines { + // Every cgroup entry should be /nomad/$ALLOC_ID + if line == "" { + continue + } - // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker - // don't isolate it by default. - // :: filters out odd empty cgroup found in latest Ubuntu lines, e.g. 
0::/user.slice/user-1000.slice/session-17.scope - // that is also not used for isolation - if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { - continue - } + // Skip rdma & misc subsystem; rdma was added in most recent kernels and libcontainer/docker + // don't isolate it by default. + // :: filters out odd empty cgroup found in latest Ubuntu lines, e.g. 0::/user.slice/user-1000.slice/session-17.scope + // that is also not used for isolation + if strings.Contains(line, ":rdma:") || strings.Contains(line, ":misc:") || strings.Contains(line, "::") { + continue + } + if !strings.Contains(line, ":/nomad/") { + return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) + } - if !strings.Contains(line, ":/nomad/") { - return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) } } return true, nil @@ -311,9 +327,10 @@ func TestExecutor_CgroupPaths(t *testing.T) { // are destroyed on shutdown func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + require := require.New(t) + testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir execCmd.Cmd = "/bin/bash" @@ -336,28 +353,34 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { var cgroupsPaths string tu.WaitForResult(func() (bool, error) { output := strings.TrimSpace(testExecCmd.stdout.String()) - // Verify that we got some cgroups - if !strings.Contains(output, ":devices:") { - return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) - } - lines := strings.Split(output, "\n") - for _, line := range lines { - // Every cgroup entry should be /nomad/$ALLOC_ID - if line == "" { - continue - } - // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker - // don't isolate it by default. - if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { - continue + switch cgutil.UseV2 { + case true: + isScope := strings.HasSuffix(output, ".scope") + require.True(isScope) + case false: + // Verify that we got some cgroups + if !strings.Contains(output, ":devices:") { + return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) } + lines := strings.Split(output, "\n") + for _, line := range lines { + // Every cgroup entry should be /nomad/$ALLOC_ID + if line == "" { + continue + } - if !strings.Contains(line, ":/nomad/") { - return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) + // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker + // don't isolate it by default. + if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { + continue + } + + if !strings.Contains(line, ":/nomad/") { + return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... 
-- found=%q", line) + } } } - cgroupsPaths = output return true, nil }, func(err error) { t.Error(err) }) @@ -383,7 +406,7 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { continue } - p, err := getCgroupPathHelper(subsystem, cgroup) + p, err := cgutil.GetCgroupPathHelperV1(subsystem, cgroup) require.NoError(err) require.Falsef(cgroups.PathExists(p), "cgroup for %s %s still exists", subsystem, cgroup) } @@ -433,8 +456,10 @@ func TestUniversalExecutor_LookupTaskBin(t *testing.T) { // Exec Launch looks for the binary only inside the chroot func TestExecutor_EscapeContainer(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig) kills the terminal, probably defaulting to / + + require := require.New(t) testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir diff --git a/drivers/shared/executor/executor_test.go b/drivers/shared/executor/executor_test.go index 4d930843c1bf..57fbbb015d0e 100644 --- a/drivers/shared/executor/executor_test.go +++ b/drivers/shared/executor/executor_test.go @@ -18,7 +18,9 @@ import ( hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" + "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -90,6 +92,10 @@ func testExecutorCommand(t *testing.T) *testExecCmd { }, } + if cgutil.UseV2 { + cmd.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.scope", cgutil.CgroupScope(alloc.ID, task.Name)) + } + testCmd := &testExecCmd{ command: cmd, allocDir: allocDir, @@ -253,6 +259,8 @@ func TestExecutor_Start_Wait_Children(t *testing.T) { func TestExecutor_WaitExitSignal(t *testing.T) { ci.Parallel(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig) #12351 + for name, factory := range executorFactories { t.Run(name, func(t *testing.T) { testExecCmd := testExecutorCommand(t) @@ -519,6 +527,7 @@ func copyFile(t *testing.T, src, dst string) { func TestExecutor_Start_Kill_Immediately_NoGrace(t *testing.T) { ci.Parallel(t) for name, factory := range executorFactories { + t.Run(name, func(t *testing.T) { require := require.New(t) testExecCmd := testExecutorCommand(t) diff --git a/drivers/shared/executor/executor_universal_linux.go b/drivers/shared/executor/executor_universal_linux.go index d5076a8d3d6a..7c97681df55c 100644 --- a/drivers/shared/executor/executor_universal_linux.go +++ b/drivers/shared/executor/executor_universal_linux.go @@ -10,6 +10,7 @@ import ( "github.com/containernetworking/plugins/pkg/ns" multierror "github.com/hashicorp/go-multierror" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/plugins/drivers" "github.com/opencontainers/runc/libcontainer/cgroups" cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" @@ -77,8 +78,7 @@ func (e *UniversalExecutor) configureResourceContainer(pid int) error { cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule) } - err := configureBasicCgroups(cfg) - if err != nil { + if err := cgutil.ConfigureBasicCgroups(cfg); err != nil { // Log this error to help diagnose cases where nomad is run with too few // permissions, but don't return an error. 
There is no separate check for // cgroup creation permissions, so this may be the happy path. diff --git a/drivers/shared/executor/client.go b/drivers/shared/executor/grpc_client.go similarity index 100% rename from drivers/shared/executor/client.go rename to drivers/shared/executor/grpc_client.go diff --git a/drivers/shared/executor/server.go b/drivers/shared/executor/grpc_server.go similarity index 100% rename from drivers/shared/executor/server.go rename to drivers/shared/executor/grpc_server.go diff --git a/drivers/shared/executor/resource_container_linux.go b/drivers/shared/executor/resource_container_linux.go index 50ab7cef9e6e..dbdf83f16282 100644 --- a/drivers/shared/executor/resource_container_linux.go +++ b/drivers/shared/executor/resource_container_linux.go @@ -30,6 +30,7 @@ func (rc *resourceContainerContext) isEmpty() bool { return rc.groups == nil } +// todo(shoenig) cgroups.v2 #12351 func (rc *resourceContainerContext) getAllPidsByCgroup() (map[int]*nomadPid, error) { nPids := map[int]*nomadPid{} diff --git a/go.mod b/go.mod index 3e522bd891ff..13de442d7e74 100644 --- a/go.mod +++ b/go.mod @@ -97,7 +97,7 @@ require ( github.com/mitchellh/mapstructure v1.4.3 github.com/mitchellh/reflectwalk v1.0.2 github.com/moby/sys/mount v0.3.0 - github.com/moby/sys/mountinfo v0.5.0 + github.com/moby/sys/mountinfo v0.6.0 github.com/opencontainers/runc v1.0.3 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 github.com/pkg/errors v0.9.1 @@ -161,9 +161,9 @@ require ( github.com/boltdb/bolt v1.3.1 // indirect github.com/census-instrumentation/opencensus-proto v0.3.0 // indirect github.com/cespare/xxhash/v2 v2.1.2 // indirect - github.com/checkpoint-restore/go-criu/v5 v5.0.0 // indirect + github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect github.com/cheggaaa/pb/v3 v3.0.5 // indirect - github.com/cilium/ebpf v0.6.2 // indirect + github.com/cilium/ebpf v0.8.1 // indirect github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible // indirect github.com/circonus-labs/circonusllhist v0.1.3 // indirect github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4 // indirect @@ -172,7 +172,7 @@ require ( github.com/containerd/console v1.0.3 // indirect github.com/containerd/containerd v1.5.9 // indirect github.com/coreos/go-systemd/v22 v22.3.2 // indirect - github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 // indirect + github.com/cyphar/filepath-securejoin v0.2.3 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba // indirect github.com/digitalocean/godo v1.10.0 // indirect @@ -185,7 +185,7 @@ require ( github.com/envoyproxy/protoc-gen-validate v0.6.2 // indirect github.com/felixge/httpsnoop v1.0.1 // indirect github.com/go-ole/go-ole v1.2.6 // indirect - github.com/godbus/dbus/v5 v5.0.4 // indirect + github.com/godbus/dbus/v5 v5.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/gojuno/minimock/v3 v3.0.6 // indirect github.com/golang-jwt/jwt/v4 v4.0.0 // indirect @@ -231,7 +231,7 @@ require ( github.com/oklog/run v1.0.1-0.20180308005104-6934b124db28 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.0.2 // indirect - github.com/opencontainers/selinux v1.8.2 // indirect + github.com/opencontainers/selinux v1.10.0 // indirect github.com/packethost/packngo v0.1.1-0.20180711074735-b9cb5096f54c // indirect github.com/pierrec/lz4 v2.5.2+incompatible // indirect github.com/pmezard/go-difflib v1.0.0 
// indirect @@ -240,7 +240,7 @@ require ( github.com/prometheus/procfs v0.7.3 // indirect github.com/renier/xmlrpc v0.0.0-20170708154548-ce4a1a486c03 // indirect github.com/rogpeppe/go-internal v1.6.1 // indirect - github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128 // indirect + github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 // indirect github.com/sirupsen/logrus v1.8.1 // indirect github.com/softlayer/softlayer-go v0.0.0-20180806151055-260589d94c7d // indirect github.com/stretchr/objx v0.2.0 // indirect diff --git a/go.sum b/go.sum index 9a472d9f02ae..ff8ba072fda7 100644 --- a/go.sum +++ b/go.sum @@ -209,8 +209,9 @@ github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/checkpoint-restore/go-criu/v4 v4.1.0/go.mod h1:xUQBLp4RLc5zJtWY++yjOoMoB5lihDt7fai+75m+rGw= -github.com/checkpoint-restore/go-criu/v5 v5.0.0 h1:TW8f/UvntYoVDMN1K2HlT82qH1rb0sOjpGw3m6Ym+i4= github.com/checkpoint-restore/go-criu/v5 v5.0.0/go.mod h1:cfwC0EG7HMUenopBsUf9d89JlCLQIfgVcNsNN0t6T2M= +github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8= +github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E= github.com/cheggaaa/pb v1.0.27 h1:wIkZHkNfC7R6GI5w7l/PdAdzXzlrbcI3p8OAlnkTsnc= github.com/cheggaaa/pb v1.0.27/go.mod h1:pQciLPpbU0oxA0h+VJYYLxO+XeDQb5pZijXscXHm81s= github.com/cheggaaa/pb/v3 v3.0.5 h1:lmZOti7CraK9RSjzExsY53+WWfub9Qv13B5m4ptEoPE= @@ -222,8 +223,9 @@ github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3/go.mod h1:MA5e5Lr8slmE github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775/go.mod h1:7cR51M8ViRLIdUjrmSXlK9pkrsDlLHbO8jiB8X8JnOc= github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= -github.com/cilium/ebpf v0.6.2 h1:iHsfF/t4aW4heW2YKfeHrVPGdtYTL4C4KocpM8KTSnI= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= +github.com/cilium/ebpf v0.8.1 h1:bLSSEbBLqGPXxls55pGr5qWZaTqcmfDJHhou7t254ao= +github.com/cilium/ebpf v0.8.1/go.mod h1:f5zLIM0FSNuAkSyLAN7X+Hy6yznlF1mNiWUMfxMtrgk= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible h1:C29Ae4G5GtYyYMm1aztcyj/J5ckgJm2zwdDajFbx1NY= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3 h1:TJH+oke8D16535+jHExHj4nQvzlZrj7ug5D7I/orNUA= @@ -372,8 +374,8 @@ github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ github.com/creack/pty v1.1.17 h1:QeVUsEDNrLBW4tMgZHvxy18sKtr6VI492kBhUfhDJNI= github.com/creack/pty v1.1.17/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= -github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 h1:dCqRswe3ZAwkQWdvFLwRqmJCpGP3DWb7bFogdqY3+QU= -github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= +github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI= +github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/d2g/dhcp4 
v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ= github.com/d2g/dhcp4client v1.0.0/go.mod h1:j0hNfjhrt2SxUOw55nL0ATM/z4Yt3t2Kd1mW34z5W5s= github.com/d2g/dhcp4server v0.0.0-20181031114812-7d4a0a7f59a5/go.mod h1:Eo87+Kg/IX2hfWJfwxMzLyuSZyxSoAug2nGa1G2QAi8= @@ -462,8 +464,9 @@ github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSw github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/frankban/quicktest v1.4.0/go.mod h1:36zfPVQyHxymz4cH7wlDmVwDrJuljRB60qkgn7rorfQ= github.com/frankban/quicktest v1.10.0/go.mod h1:ui7WezCLWMWxVWr1GETZY3smRy0G4KWq9vcPtJmFl7Y= -github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/frankban/quicktest v1.14.0 h1:+cqqvzZV87b4adx/5ayVOaYZ2CrvM4ejQvUdBzPPUss= +github.com/frankban/quicktest v1.14.0/go.mod h1:NeW+ay9A/U67EYXNFA1nPE8e/tnQv/09mUdL/ijj8og= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= @@ -515,8 +518,9 @@ github.com/godbus/dbus v0.0.0-20180201030542-885f9cc04c9c/go.mod h1:/YcGZj5zSblf github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e h1:BWhy2j3IXJhjCbC68FptL43tDKIq8FladmaTs3Xs7Z8= github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= +github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/googleapis v1.2.0/go.mod h1:Njal3psf3qN6dwBtQfUmBZh2ybovJ0tlu3o/AC7HYjU= github.com/gogo/googleapis v1.4.0/go.mod h1:5YRNX2z1oM5gXdAkurHa942MDgEJyk02w4OecKY87+c= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= @@ -985,8 +989,9 @@ github.com/moby/sys/mount v0.3.0 h1:bXZYMmq7DBQPwHRxH/MG+u9+XF90ZOwoXpHTOznMGp0= github.com/moby/sys/mount v0.3.0/go.mod h1:U2Z3ur2rXPFrFmy4q6WMwWrBOAQGYtYTRVM8BIvzbwk= github.com/moby/sys/mountinfo v0.4.0/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= github.com/moby/sys/mountinfo v0.4.1/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= -github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI= github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= +github.com/moby/sys/mountinfo v0.6.0 h1:gUDhXQx58YNrpHlK4nSL+7y2pxFZkUcXqzFDKWdC0Oo= +github.com/moby/sys/mountinfo v0.6.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= github.com/moby/sys/symlink v0.1.0/go.mod h1:GGDODQmbFOjFsXvfLVn3+ZRxkch54RkSiGqsZeMYowQ= github.com/moby/term v0.0.0-20200312100748-672ec06f55cd/go.mod h1:DdlQx2hp0Ss5/fLikoLlEeIYiATotOjgB//nb973jeo= github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 h1:dcztxKSvZ4Id8iPpHERQBbIJfabdt4wUm5qy3wOL2Zc= @@ -1064,8 +1069,9 @@ github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.m github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39/go.mod 
h1:r3f7wjNzSs2extwzU3Y+6pKfobzPh+kKFJ3ofN+3nfs= github.com/opencontainers/selinux v1.6.0/go.mod h1:VVGKuOLlE7v4PJyT6h7mNWvq1rzqiriPsEqVhc+svHE= github.com/opencontainers/selinux v1.8.0/go.mod h1:RScLhm78qiWa2gbVCcGkC7tCGdgk3ogry1nUQF8Evvo= -github.com/opencontainers/selinux v1.8.2 h1:c4ca10UMgRcvZ6h0K4HtS15UaVSBEaE+iln2LVpAuGc= github.com/opencontainers/selinux v1.8.2/go.mod h1:MUIHuUEvKB1wtJjQdOyYRgOnLD2xAPP8dBsCoU0KuF8= +github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= +github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/packethost/packngo v0.1.1-0.20180711074735-b9cb5096f54c h1:vwpFWvAO8DeIZfFeqASzZfsxuWPno9ncAebBEP0N3uE= @@ -1160,8 +1166,8 @@ github.com/sclevine/agouti v3.0.0+incompatible/go.mod h1:b4WX9W9L1sfQKXeJf1mUTLZ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= -github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128 h1:vWI7jlfJVWN//T8jrt5JduYkSid+Sl/fRz33J1jQ83k= -github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y= +github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/shirou/gopsutil v0.0.0-20181107111621-48177ef5f880 h1:1Ge4j/3uB2rxzPWD3TC+daeCw+w91z8UCUL/7WH5gn8= @@ -1570,6 +1576,7 @@ golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210816074244-15123e1e1f71/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210816183151-1e6c022a8912/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210917161153-d61c044b1678/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211013075003-97ac67df715c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/lib/cpuset/cpuset.go b/lib/cpuset/cpuset.go index 0caa966710d3..9cea06cad877 100644 --- a/lib/cpuset/cpuset.go +++ b/lib/cpuset/cpuset.go @@ -28,6 +28,17 @@ func New(cpus ...uint16) CPUSet { return cpuset } +// Copy returns a deep copy of CPUSet c. 
+func (c CPUSet) Copy() CPUSet { + cpus := make(map[uint16]struct{}, len(c.cpus)) + for k := range c.cpus { + cpus[k] = struct{}{} + } + return CPUSet{ + cpus: cpus, + } +} + // String returns the cpuset as a comma delimited set of core values and ranged func (c CPUSet) String() string { if c.Size() == 0 { diff --git a/lib/cpuset/cpuset_test.go b/lib/cpuset/cpuset_test.go index 178517680b12..e417744b03d8 100644 --- a/lib/cpuset/cpuset_test.go +++ b/lib/cpuset/cpuset_test.go @@ -207,7 +207,7 @@ func TestParse(t *testing.T) { func TestCPUSet_String(t *testing.T) { ci.Parallel(t) - + cases := []struct { cpuset CPUSet expected string @@ -222,3 +222,16 @@ func TestCPUSet_String(t *testing.T) { require.Equal(t, c.expected, c.cpuset.String()) } } + +func TestCPUSet_Copy(t *testing.T) { + ci.Parallel(t) + + original := New(1, 2, 3, 4, 5) + copied := original.Copy() + require.True(t, original.Equals(copied)) + + delete(copied.cpus, 3) + require.False(t, original.Equals(copied)) + require.True(t, original.Equals(New(1, 2, 3, 4, 5))) + require.True(t, copied.Equals(New(1, 2, 4, 5))) +} diff --git a/plugins/drivers/driver.go b/plugins/drivers/driver.go index f47dc5a2815b..4b8cd7d13531 100644 --- a/plugins/drivers/driver.go +++ b/plugins/drivers/driver.go @@ -267,7 +267,7 @@ type TaskConfig struct { JobName string JobID string TaskGroupName string - Name string + Name string // task.Name Namespace string NodeName string NodeID string diff --git a/plugins/drivers/testutils/exec_testing.go b/plugins/drivers/testutils/exec_testing.go index 8d824a16d9dc..a1cd361b11d9 100644 --- a/plugins/drivers/testutils/exec_testing.go +++ b/plugins/drivers/testutils/exec_testing.go @@ -14,8 +14,10 @@ import ( "testing" "time" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/plugins/drivers" dproto "github.com/hashicorp/nomad/plugins/drivers/proto" + "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" ) @@ -192,9 +194,35 @@ func TestExecFSIsolation(t *testing.T, driver *DriverHarness, taskID string) { false, "") require.Zero(t, r.exitCode) - if !strings.Contains(r.stdout, ":freezer:/nomad") && !strings.Contains(r.stdout, "freezer:/docker") { - require.Fail(t, "unexpected freezer cgroup", "expected freezer to be /nomad/ or /docker/, but found:\n%s", r.stdout) + if !cgutil.UseV2 { + acceptable := []string{ + ":freezer:/nomad", ":freezer:/docker", + } + if testutil.IsCI() { + // github actions freezer cgroup + acceptable = append(acceptable, ":freezer:/actions_job") + } + ok := false + for _, freezerGroup := range acceptable { + if strings.Contains(r.stdout, freezerGroup) { + ok = true + break + } + } + if !ok { + require.Fail(t, "unexpected freezer cgroup", "expected freezer to be /nomad/ or /docker/, but found:\n%s", r.stdout) + } + } else { + info, _ := driver.PluginInfo() + if info.Name == "docker" { + // Note: docker on cgroups v2 now returns nothing + // root@97b4d3d33035:/# cat /proc/self/cgroup + // 0::/ + t.Skip("/proc/self/cgroup not useful in docker cgroups.v2") + } + // e.g. 
0::/testing.slice/5bdbd6c2-8aba-3ab2-728b-0ff3a81727a9.sleep.scope + require.True(t, strings.HasSuffix(strings.TrimSpace(r.stdout), ".scope"), "actual stdout %q", r.stdout) } }) } diff --git a/plugins/drivers/testutils/testing.go b/plugins/drivers/testutils/testing.go index 5056344e0824..f0f6c6713c2c 100644 --- a/plugins/drivers/testutils/testing.go +++ b/plugins/drivers/testutils/testing.go @@ -4,17 +4,17 @@ import ( "context" "fmt" "io/ioutil" + "os" "path/filepath" "runtime" "strings" "time" - testing "github.com/mitchellh/go-testing-interface" - hclog "github.com/hashicorp/go-hclog" plugin "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/logmon" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/helper/testlog" @@ -24,6 +24,7 @@ import ( "github.com/hashicorp/nomad/plugins/base" "github.com/hashicorp/nomad/plugins/drivers" "github.com/hashicorp/nomad/plugins/shared/hclspec" + testing "github.com/mitchellh/go-testing-interface" "github.com/stretchr/testify/require" ) @@ -34,6 +35,7 @@ type DriverHarness struct { t testing.T logger hclog.Logger impl drivers.DriverPlugin + cgroup string } func (h *DriverHarness) Impl() drivers.DriverPlugin { @@ -53,12 +55,10 @@ func NewDriverHarness(t testing.T, d drivers.DriverPlugin) *DriverHarness { ) raw, err := client.Dispense(base.PluginTypeDriver) - if err != nil { - t.Fatalf("err dispensing plugin: %v", err) - } + require.NoError(t, err, "failed to dispense plugin") dClient := raw.(drivers.DriverPlugin) - h := &DriverHarness{ + return &DriverHarness{ client: client, server: server, DriverPlugin: dClient, @@ -66,13 +66,36 @@ func NewDriverHarness(t testing.T, d drivers.DriverPlugin) *DriverHarness { t: t, impl: d, } +} - return h +// setCgroup creates a v2 cgroup for the task, as if a Client were initialized +// and managing the cgroup as it normally would. +// +// Uses testing.slice as a parent. 
+func (h *DriverHarness) setCgroup(allocID, task string) { + if cgutil.UseV2 { + h.cgroup = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, task)) + f, err := os.Create(h.cgroup) + if err != nil { + panic(err) + } + defer f.Close() + } } func (h *DriverHarness) Kill() { - h.client.Close() + _ = h.client.Close() h.server.Stop() + h.cleanupCgroup() +} + +func (h *DriverHarness) cleanupCgroup() { + if cgutil.UseV2 { + err := os.Remove(h.cgroup) + if err != nil { + panic(err) + } + } } // tinyChroot is useful for testing, where we do not use anything other than @@ -139,6 +162,7 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( // Create the mock allocation alloc := mock.Alloc() + alloc.ID = t.AllocID if t.Resources != nil { alloc.AllocatedResources.Tasks[task.Name] = t.Resources.NomadResources } @@ -157,6 +181,9 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( } } + // set cgroup + h.setCgroup(alloc.ID, task.Name) + //logmon if enableLogs { lm := logmon.NewLogMon(h.logger.Named("logmon")) @@ -189,6 +216,7 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( return func() { h.client.Close() allocDir.Destroy() + h.cleanupCgroup() } } diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index 56d2d7c44f56..874d2158e473 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -94,6 +94,29 @@ be removed in future releases. The previous `Protocol` value can be viewed using the `-verbose` flag. +#### Linux Control Groups Version 2 + +Starting with Nomad 1.3.0, Linux systems configured to use [cgroups v2][cgroups2] are +now supported. A Nomad client will only activate its v2 control groups manager if the +system is configured with the cgroups2 controller mounted at `/sys/fs/cgroup`. This implies +Nomad will continue to fall back to the v1 control groups manager on systems +configured to run in hybrid mode, where the cgroups2 controller is typically mounted +at `/sys/fs/cgroup/unified`. Systems that do not support cgroups v2 are not affected. A +new client attribute `unique.cgroup.version` indicates which version of control groups +Nomad is using. + +When cgroups v2 are in use, Nomad uses `nomad.slice` as the [default parent][cgroup_parent] for cgroups +created on behalf of tasks. The cgroup created for a task is named in the form `<allocID>.<task>.scope`. +These cgroups are created by Nomad before a task starts. External task drivers that support +containerization should be updated to make use of the new cgroup locations. + +``` +➜ tree -d /sys/fs/cgroup/nomad.slice +/sys/fs/cgroup/nomad.slice +├── 8b8da4cf-8ebf-b578-0bcf-77190749abf3.redis.scope +└── a8c8e495-83c8-311b-4657-e6e3127e98bc.example.scope +``` + ## Nomad 1.2.6, 1.1.12, and 1.0.18 #### ACL requirement for the job parse endpoint @@ -1262,6 +1285,8 @@ state. Once that is done the client can be killed, the `data_dir` should be deleted and then Nomad 0.3.0 can be launched. [api_jobs_parse]: /api-docs/jobs#parse-job +[cgroups2]: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html +[cgroup_parent]: /docs/configuration/client#cgroup_parent [dangling-containers]: /docs/drivers/docker#dangling-containers [drain-api]: /api-docs/nodes#drain-node [drain-cli]: /docs/commands/node/drain
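The upgrade note above says the v2 manager only activates when the cgroup2 filesystem is mounted at `/sys/fs/cgroup` (hybrid systems, where it sits at `/sys/fs/cgroup/unified`, keep the v1 manager). The snippet below is a minimal sketch of the conventional statfs-magic check for that condition; it is illustrative only and is not necessarily how `cgutil.UseV2` is computed in this changeset.

```go
//go:build linux

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// isCgroupsV2 reports whether the unified cgroup2 filesystem is mounted at
// /sys/fs/cgroup, which is the condition the upgrade note gives for Nomad
// activating its v2 control groups manager.
func isCgroupsV2() bool {
	var st unix.Statfs_t
	if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
		return false
	}
	return st.Type == unix.CGROUP2_SUPER_MAGIC
}

func main() {
	fmt.Println("cgroups v2 unified mode:", isCgroupsV2())
}
```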
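As a quick illustration of the cgroup layout the upgrade note describes: v2 task cgroups live under the `nomad.slice` parent and follow the `<allocID>.<task>.scope` naming that the test helpers in this diff build from `cgutil.CgroupRoot` and `cgutil.CgroupScope`. The standalone sketch below uses local stand-ins (`cgroupRoot`, `cgroupScope`) for those helpers and is not part of the changeset itself.

```go
package main

import (
	"fmt"
	"path/filepath"
)

// cgroupRoot is a local stand-in for the cgutil.CgroupRoot constant
// referenced throughout this diff.
const cgroupRoot = "/sys/fs/cgroup"

// cgroupScope renders the "<allocID>.<task>.scope" naming convention,
// matching the tree output shown in the upgrade note.
func cgroupScope(allocID, task string) string {
	return fmt.Sprintf("%s.%s.scope", allocID, task)
}

func main() {
	allocID := "8b8da4cf-8ebf-b578-0bcf-77190749abf3"
	fmt.Println(filepath.Join(cgroupRoot, "nomad.slice", cgroupScope(allocID, "redis")))
	// Output: /sys/fs/cgroup/nomad.slice/8b8da4cf-8ebf-b578-0bcf-77190749abf3.redis.scope
}
```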