From 95ba78418f4d13b1854e09fd59c8b4fbaa19e6e2 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Wed, 23 Mar 2022 12:56:48 +0100 Subject: [PATCH 01/16] cli: update namespace wildcard help to be non-specific. A number of commands support namespace wildcard querying, so it should be up to the sub-command to detail support, rather than keeping this list up to date. --- command/meta.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/command/meta.go b/command/meta.go index 202421d5a5fe..736dfcfa2063 100644 --- a/command/meta.go +++ b/command/meta.go @@ -242,8 +242,8 @@ func generalOptionsUsage(usageOpts usageOptsFlags) string { -namespace= The target namespace for queries and actions bound to a namespace. Overrides the NOMAD_NAMESPACE environment variable if set. - If set to '*', job and alloc subcommands query all namespaces authorized - to user. + If set to '*', subcommands which support this functionality query + all namespaces authorized to user. Defaults to the "default" namespace. ` From 876e8168d333780a4cbac0fe71a14c5341d367bf Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 23 Mar 2022 09:21:26 -0400 Subject: [PATCH 02/16] CSI: enforce single access mode at validation time (#12337) A volume that has single-use access mode is feasibility checked during scheduling to ensure that only a single reader or writer claim exists. However, because feasibility checking is done one alloc at a time before the plan is written, a job that's misconfigured to have count > 1 that mounts one of these volumes will pass feasibility checking. Enforce the check at validation time instead to prevent us from even trying to evaluation a job that's misconfigured this way. 
--- .changelog/12337.txt | 3 ++ nomad/job_endpoint_test.go | 2 +- nomad/structs/csi_test.go | 2 +- nomad/structs/structs.go | 2 +- nomad/structs/volume_test.go | 91 ++++++++++++++++++++++++++++++++++ nomad/structs/volumes.go | 95 +++++++++++++++++++++++------------- 6 files changed, 158 insertions(+), 37 deletions(-) create mode 100644 .changelog/12337.txt create mode 100644 nomad/structs/volume_test.go diff --git a/.changelog/12337.txt b/.changelog/12337.txt new file mode 100644 index 000000000000..b6ef664d3a97 --- /dev/null +++ b/.changelog/12337.txt @@ -0,0 +1,3 @@ +```release-note:bug +csi: Fixed a bug where single-use access modes were not enforced during validation +``` diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 3cf238b35596..a94d2ab59eaa 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -756,7 +756,7 @@ func TestJobEndpoint_Register_ACL(t *testing.T) { Type: structs.VolumeTypeCSI, Source: "prod-db", AttachmentMode: structs.CSIVolumeAttachmentModeBlockDevice, - AccessMode: structs.CSIVolumeAccessModeSingleNodeWriter, + AccessMode: structs.CSIVolumeAccessModeMultiNodeMultiWriter, }, } diff --git a/nomad/structs/csi_test.go b/nomad/structs/csi_test.go index 9ef5a7f8b679..5da73a78b688 100644 --- a/nomad/structs/csi_test.go +++ b/nomad/structs/csi_test.go @@ -1002,7 +1002,7 @@ func TestDeleteNodeForType_Monolith_NilController(t *testing.T) { func TestDeleteNodeForType_Monolith_NilNode(t *testing.T) { ci.Parallel(t) - + plug := NewCSIPlugin("foo", 1000) plug.Nodes["n0"] = nil diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index c4ac6a652cfb..698512bad482 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -6396,7 +6396,7 @@ func (tg *TaskGroup) Validate(j *Job) error { canaries = tg.Update.Canary } for name, volReq := range tg.Volumes { - if err := volReq.Validate(canaries); err != nil { + if err := volReq.Validate(tg.Count, canaries); err != nil { mErr.Errors = 
append(mErr.Errors, fmt.Errorf( "Task group volume validation for %s failed: %v", name, err)) } diff --git a/nomad/structs/volume_test.go b/nomad/structs/volume_test.go new file mode 100644 index 000000000000..cbe9ed6fe8ba --- /dev/null +++ b/nomad/structs/volume_test.go @@ -0,0 +1,91 @@ +package structs + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/stretchr/testify/require" +) + +func TestVolumeRequest_Validate(t *testing.T) { + ci.Parallel(t) + + testCases := []struct { + name string + expected []string + canariesCount int + taskGroupCount int + req *VolumeRequest + }{ + { + name: "host volume with empty source", + expected: []string{"volume has an empty source"}, + req: &VolumeRequest{ + Type: VolumeTypeHost, + }, + }, + { + name: "host volume with CSI volume config", + expected: []string{ + "host volumes cannot have an access mode", + "host volumes cannot have an attachment mode", + "host volumes cannot have mount options", + "host volumes do not support per_alloc", + }, + req: &VolumeRequest{ + Type: VolumeTypeHost, + ReadOnly: false, + AccessMode: CSIVolumeAccessModeSingleNodeReader, + AttachmentMode: CSIVolumeAttachmentModeBlockDevice, + MountOptions: &CSIMountOptions{ + FSType: "ext4", + MountFlags: []string{"ro"}, + }, + PerAlloc: true, + }, + }, + { + name: "CSI volume multi-reader-single-writer access mode", + expected: []string{ + "volume with multi-node-single-writer access mode allows only one writer", + }, + taskGroupCount: 2, + req: &VolumeRequest{ + Type: VolumeTypeCSI, + AccessMode: CSIVolumeAccessModeMultiNodeSingleWriter, + }, + }, + { + name: "CSI volume single reader access mode", + expected: []string{ + "volume with single-node-reader-only access mode allows only one reader", + }, + taskGroupCount: 2, + req: &VolumeRequest{ + Type: VolumeTypeCSI, + AccessMode: CSIVolumeAccessModeSingleNodeReader, + ReadOnly: true, + }, + }, + { + name: "CSI volume per-alloc with canaries", + expected: []string{"volume cannot be 
per_alloc when canaries are in use"}, + canariesCount: 1, + req: &VolumeRequest{ + Type: VolumeTypeCSI, + PerAlloc: true, + }, + }, + } + + for _, tc := range testCases { + tc = tc + t.Run(tc.name, func(t *testing.T) { + err := tc.req.Validate(tc.taskGroupCount, tc.canariesCount) + for _, expected := range tc.expected { + require.Contains(t, err.Error(), expected) + } + }) + } + +} diff --git a/nomad/structs/volumes.go b/nomad/structs/volumes.go index 5fe5238732d3..0f8b040de458 100644 --- a/nomad/structs/volumes.go +++ b/nomad/structs/volumes.go @@ -102,54 +102,81 @@ type VolumeRequest struct { PerAlloc bool } -func (v *VolumeRequest) Validate(canaries int) error { +func (v *VolumeRequest) Validate(taskGroupCount, canaries int) error { if !(v.Type == VolumeTypeHost || v.Type == VolumeTypeCSI) { return fmt.Errorf("volume has unrecognized type %s", v.Type) } var mErr multierror.Error - if v.Type == VolumeTypeHost && v.AttachmentMode != CSIVolumeAttachmentModeUnknown { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("host volumes cannot have an attachment mode")) + addErr := func(msg string, args ...interface{}) { + mErr.Errors = append(mErr.Errors, fmt.Errorf(msg, args...)) } - if v.Type == VolumeTypeHost && v.AccessMode != CSIVolumeAccessModeUnknown { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("host volumes cannot have an access mode")) - } - if v.Type == VolumeTypeHost && v.MountOptions != nil { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("host volumes cannot have mount options")) - } - if v.Type == VolumeTypeCSI && v.AttachmentMode == CSIVolumeAttachmentModeUnknown { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("CSI volumes must have an attachment mode")) - } - if v.Type == VolumeTypeCSI && v.AccessMode == CSIVolumeAccessModeUnknown { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("CSI volumes must have an access mode")) + + if v.Source == "" { + addErr("volume has an empty source") } - if v.AccessMode == CSIVolumeAccessModeSingleNodeReader || 
v.AccessMode == CSIVolumeAccessModeMultiNodeReader { - if !v.ReadOnly { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("%s volumes must be read-only", v.AccessMode)) + switch v.Type { + + case VolumeTypeHost: + if v.AttachmentMode != CSIVolumeAttachmentModeUnknown { + addErr("host volumes cannot have an attachment mode") + } + if v.AccessMode != CSIVolumeAccessModeUnknown { + addErr("host volumes cannot have an access mode") + } + if v.MountOptions != nil { + addErr("host volumes cannot have mount options") + } + if v.PerAlloc { + addErr("host volumes do not support per_alloc") } - } - if v.AttachmentMode == CSIVolumeAttachmentModeBlockDevice && v.MountOptions != nil { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("block devices cannot have mount options")) - } + case VolumeTypeCSI: - if v.PerAlloc && canaries > 0 { - mErr.Errors = append(mErr.Errors, - fmt.Errorf("volume cannot be per_alloc when canaries are in use")) - } + switch v.AttachmentMode { + case CSIVolumeAttachmentModeUnknown: + addErr("CSI volumes must have an attachment mode") + case CSIVolumeAttachmentModeBlockDevice: + if v.MountOptions != nil { + addErr("block devices cannot have mount options") + } + } + + switch v.AccessMode { + case CSIVolumeAccessModeUnknown: + addErr("CSI volumes must have an access mode") + case CSIVolumeAccessModeSingleNodeReader: + if !v.ReadOnly { + addErr("%s volumes must be read-only", v.AccessMode) + } + if taskGroupCount > 1 && !v.PerAlloc { + addErr("volume with %s access mode allows only one reader", v.AccessMode) + } + case CSIVolumeAccessModeSingleNodeWriter: + // note: we allow read-only mount of this volume, but only one + if taskGroupCount > 1 && !v.PerAlloc { + addErr("volume with %s access mode allows only one reader or writer", v.AccessMode) + } + case CSIVolumeAccessModeMultiNodeReader: + if !v.ReadOnly { + addErr("%s volumes must be read-only", v.AccessMode) + } + case CSIVolumeAccessModeMultiNodeSingleWriter: + if !v.ReadOnly && taskGroupCount > 1 && 
!v.PerAlloc { + addErr("volume with %s access mode allows only one writer", v.AccessMode) + } + case CSIVolumeAccessModeMultiNodeMultiWriter: + // note: we intentionally allow read-only mount of this mode + } + + if v.PerAlloc && canaries > 0 { + addErr("volume cannot be per_alloc when canaries are in use") + } - if v.Source == "" { - mErr.Errors = append(mErr.Errors, fmt.Errorf("volume has an empty source")) } + return mErr.ErrorOrNil() } From fe65d80501013f2e98066b9402da0054ee0bd4c2 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 23 Mar 2022 10:39:28 -0400 Subject: [PATCH 03/16] CSI: fix timestamp from volume snapshot responses (#12352) Listing snapshots was incorrectly returning nanoseconds instead of seconds, and formatting of timestamps both list and create snapshot was treating the timestamp as though it were nanoseconds instead of seconds. This resulted in create timestamps always being displayed as zero values. Fix the unit conversion error in the command line and the incorrect extraction in the CSI plugin client code. Beef up the unit tests to make sure this code is actually exercised. 
--- .changelog/12352.txt | 3 +++ command/volume_snapshot_list.go | 2 +- go.mod | 2 +- nomad/csi_endpoint_test.go | 3 +++ plugins/csi/client_test.go | 24 +++++++++++++++++------- plugins/csi/plugin.go | 2 +- 6 files changed, 26 insertions(+), 10 deletions(-) create mode 100644 .changelog/12352.txt diff --git a/.changelog/12352.txt b/.changelog/12352.txt new file mode 100644 index 000000000000..688a1a9ba544 --- /dev/null +++ b/.changelog/12352.txt @@ -0,0 +1,3 @@ +```release-note:bug +csi: Fixed a bug where volume snapshot timestamps were always zero values +``` diff --git a/command/volume_snapshot_list.go b/command/volume_snapshot_list.go index 814081aaab32..a27e71bfaaf8 100644 --- a/command/volume_snapshot_list.go +++ b/command/volume_snapshot_list.go @@ -186,7 +186,7 @@ func csiFormatSnapshots(snapshots []*api.CSISnapshot, verbose bool) string { v.ID, limit(v.ExternalSourceVolumeID, length), humanize.IBytes(uint64(v.SizeBytes)), - formatUnixNanoTime(v.CreateTime), + formatUnixNanoTime(v.CreateTime*1e9), // seconds to nanoseconds v.IsReady, )) } diff --git a/go.mod b/go.mod index 3e522bd891ff..0218687a452d 100644 --- a/go.mod +++ b/go.mod @@ -122,6 +122,7 @@ require ( golang.org/x/sys v0.0.0-20220114195835-da31bd327af9 golang.org/x/time v0.0.0-20200630173020-3af7569d3a1e google.golang.org/grpc v1.44.0 + google.golang.org/protobuf v1.27.1 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 gopkg.in/tomb.v2 v2.0.0-20140626144623-14b3d72120e8 oss.indeed.com/go/libtime v1.5.0 @@ -265,7 +266,6 @@ require ( google.golang.org/api v0.60.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20211104193956-4c6863e31247 // indirect - google.golang.org/protobuf v1.27.1 // indirect gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect gopkg.in/fsnotify.v1 v1.4.7 // indirect gopkg.in/resty.v1 v1.12.0 // indirect diff --git a/nomad/csi_endpoint_test.go b/nomad/csi_endpoint_test.go index 6e91c4e7873e..78e806a99269 100644 --- 
a/nomad/csi_endpoint_test.go +++ b/nomad/csi_endpoint_test.go @@ -4,6 +4,7 @@ import ( "fmt" "strings" "testing" + "time" msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/acl" @@ -1322,12 +1323,14 @@ func TestCSIVolumeEndpoint_CreateSnapshot(t *testing.T) { testutil.WaitForLeader(t, srv.RPC) + now := time.Now().Unix() fake := newMockClientCSI() fake.NextCreateSnapshotError = nil fake.NextCreateSnapshotResponse = &cstructs.ClientCSIControllerCreateSnapshotResponse{ ID: "snap-12345", ExternalSourceVolumeID: "vol-12345", SizeBytes: 42, + CreateTime: now, IsReady: true, } diff --git a/plugins/csi/client_test.go b/plugins/csi/client_test.go index 35abf626a822..3d67381a8dca 100644 --- a/plugins/csi/client_test.go +++ b/plugins/csi/client_test.go @@ -6,6 +6,7 @@ import ( "fmt" "path/filepath" "testing" + "time" csipbv1 "github.com/container-storage-interface/spec/lib/go/csi" "github.com/golang/protobuf/ptypes/wrappers" @@ -16,6 +17,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "google.golang.org/protobuf/types/known/timestamppb" ) func newTestClient(t *testing.T) (*fake.IdentityClient, *fake.ControllerClient, *fake.NodeClient, CSIPlugin) { @@ -990,6 +992,8 @@ func TestClient_RPC_ControllerListVolume(t *testing.T) { func TestClient_RPC_ControllerCreateSnapshot(t *testing.T) { ci.Parallel(t) + now := time.Now() + cases := []struct { Name string Request *ControllerCreateSnapshotRequest @@ -1024,6 +1028,7 @@ func TestClient_RPC_ControllerCreateSnapshot(t *testing.T) { SizeBytes: 100000, SnapshotId: "snap-12345", SourceVolumeId: "vol-12345", + CreationTime: timestamppb.New(now), ReadyToUse: true, }, }, @@ -1039,11 +1044,13 @@ func TestClient_RPC_ControllerCreateSnapshot(t *testing.T) { // note: there's nothing interesting to assert about the response // here other than that we don't throw a NPE during transformation // from protobuf to our struct - _, err := 
client.ControllerCreateSnapshot(context.TODO(), tc.Request) + resp, err := client.ControllerCreateSnapshot(context.TODO(), tc.Request) if tc.ExpectedErr != nil { require.EqualError(t, err, tc.ExpectedErr.Error()) } else { require.NoError(t, err, tc.Name) + require.NotZero(t, resp.Snapshot.CreateTime) + require.Equal(t, now.Second(), time.Unix(resp.Snapshot.CreateTime, 0).Second()) } }) } @@ -1120,16 +1127,15 @@ func TestClient_RPC_ControllerListSnapshots(t *testing.T) { }, } + now := time.Now() + for _, tc := range cases { t.Run(tc.Name, func(t *testing.T) { _, cc, _, client := newTestClient(t) defer client.Close() cc.NextErr = tc.ResponseErr - if tc.ResponseErr != nil { - // note: there's nothing interesting to assert here other than - // that we don't throw a NPE during transformation from - // protobuf to our struct + if tc.ResponseErr == nil { cc.NextListSnapshotsResponse = &csipbv1.ListSnapshotsResponse{ Entries: []*csipbv1.ListSnapshotsResponse_Entry{ { @@ -1138,6 +1144,7 @@ func TestClient_RPC_ControllerListSnapshots(t *testing.T) { SnapshotId: "snap-12345", SourceVolumeId: "vol-12345", ReadyToUse: true, + CreationTime: timestamppb.New(now), }, }, }, @@ -1152,7 +1159,10 @@ func TestClient_RPC_ControllerListSnapshots(t *testing.T) { } require.NoError(t, err, tc.Name) require.NotNil(t, resp) - + require.Len(t, resp.Entries, 1) + require.NotZero(t, resp.Entries[0].Snapshot.CreateTime) + require.Equal(t, now.Second(), + time.Unix(resp.Entries[0].Snapshot.CreateTime, 0).Second()) }) } } @@ -1298,7 +1308,7 @@ func TestClient_RPC_NodePublishVolume(t *testing.T) { } func TestClient_RPC_NodeUnpublishVolume(t *testing.T) { ci.Parallel(t) - + cases := []struct { Name string ExternalID string diff --git a/plugins/csi/plugin.go b/plugins/csi/plugin.go index c03390688b69..3ceb2621c57a 100644 --- a/plugins/csi/plugin.go +++ b/plugins/csi/plugin.go @@ -788,7 +788,7 @@ func NewListSnapshotsResponse(resp *csipbv1.ListSnapshotsResponse) *ControllerLi SizeBytes: 
snap.GetSizeBytes(), ID: snap.GetSnapshotId(), SourceVolumeID: snap.GetSourceVolumeId(), - CreateTime: int64(snap.GetCreationTime().GetNanos()), + CreateTime: snap.GetCreationTime().GetSeconds(), IsReady: snap.GetReadyToUse(), }, }) From 1e55a35848a891910799c7e895fae4db5cdca24f Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 23 Mar 2022 12:01:08 -0400 Subject: [PATCH 04/16] csi: set gRPC authority header for unix domain socket (#12359) The go-grpc library used by most CSI plugins doesn't require the authority header to be set, which violates the HTTP2 spec but doesn't impact Nomad because both sides of the connection are using the same library. But plugins written in other languages (`democratic-csi` for example) may have more strictly conforming gRPC server libraries and we need to set the authority header manually. --- .changelog/12359.txt | 3 +++ plugins/csi/client.go | 1 + 2 files changed, 4 insertions(+) create mode 100644 .changelog/12359.txt diff --git a/.changelog/12359.txt b/.changelog/12359.txt new file mode 100644 index 000000000000..ff00dad7d977 --- /dev/null +++ b/.changelog/12359.txt @@ -0,0 +1,3 @@ +```release-note:bug +csi: Fixed a bug where plugins written in NodeJS could fail to fingerprint +``` diff --git a/plugins/csi/client.go b/plugins/csi/client.go index 8c5185bbefff..1a4dcf0572b2 100644 --- a/plugins/csi/client.go +++ b/plugins/csi/client.go @@ -162,6 +162,7 @@ func newGrpcConn(addr string, logger hclog.Logger) (*grpc.ClientConn, error) { grpc.WithInsecure(), grpc.WithUnaryInterceptor(logging.UnaryClientInterceptor(logger)), grpc.WithStreamInterceptor(logging.StreamClientInterceptor(logger)), + grpc.WithAuthority("localhost"), grpc.WithDialer(func(target string, timeout time.Duration) (net.Conn, error) { return net.DialTimeout("unix", target, timeout) }), From 5da1a31e945d69ddbe74d0cd2fed96c57e5d3c2f Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Mon, 28 Feb 2022 16:24:01 -0600 Subject: [PATCH 05/16] client: enable support for cgroups 
v2 This PR introduces support for using Nomad on systems with cgroups v2 [1] enabled as the cgroups controller mounted on /sys/fs/cgroups. Newer Linux distros like Ubuntu 21.10 are shipping with cgroups v2 only, causing problems for Nomad users. Nomad mostly "just works" with cgroups v2 due to the indirection via libcontainer, but not so for managing cpuset cgroups. Before, Nomad has been making use of a feature in v1 where a PID could be a member of more than one cgroup. In v2 this is no longer possible, and so the logic around computing cpuset values must be modified. When Nomad detects v2, it manages cpuset values in-process, rather than making use of cgroup heirarchy inheritence via shared/reserved parents. Nomad will only activate the v2 logic when it detects cgroups2 is mounted at /sys/fs/cgroups. This means on systems running in hybrid mode with cgroups2 mounted at /sys/fs/cgroups/unified (as is typical) Nomad will continue to use the v1 logic, and should operate as before. Systems that do not support cgroups v2 are also not affected. When v2 is activated, Nomad will create a parent called nomad.slice (unless otherwise configured in Client conifg), and create cgroups for tasks using naming convention -.scope. These follow the naming convention set by systemd and also used by Docker when cgroups v2 is detected. Client nodes now export a new fingerprint attribute, unique.cgroups.version which will be set to 'v1' or 'v2' to indicate the cgroups regime in use by Nomad. The new cpuset management strategy fixes #11705, where docker tasks that spawned processes on startup would "leak". In cgroups v2, the PIDs are started in the cgroup they will always live in, and thus the cause of the leak is eliminated. 
[1] https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html Closes #11289 Fixes #11705 #11773 #11933 --- .changelog/12274.txt | 3 + .github/workflows/test-core.yaml | 6 +- client/alloc_endpoint_test.go | 1 + client/allocrunner/testing.go | 2 +- client/client.go | 22 +- client/client_test.go | 3 +- client/config/config.go | 2 +- client/fingerprint/cgroup.go | 61 +++- client/fingerprint/cgroup_default.go | 1 - client/fingerprint/cgroup_linux.go | 22 +- client/fingerprint/cgroup_test.go | 138 +++++--- client/fingerprint/cpu_linux.go | 9 +- client/lib/cgutil/cgutil_default.go | 14 - client/lib/cgutil/cgutil_linux.go | 176 +++++----- client/lib/cgutil/cgutil_linux_test.go | 124 +++++++ client/lib/cgutil/cgutil_noop.go | 42 +++ client/lib/cgutil/cpuset_manager.go | 69 +++- client/lib/cgutil/cpuset_manager_default.go | 10 - client/lib/cgutil/cpuset_manager_test.go | 28 ++ ..._manager_linux.go => cpuset_manager_v1.go} | 184 ++++++++-- ...inux_test.go => cpuset_manager_v1_test.go} | 68 ++-- client/lib/cgutil/cpuset_manager_v2.go | 332 ++++++++++++++++++ client/lib/cgutil/cpuset_manager_v2_test.go | 90 +++++ client/testutil/driver_compatible.go | 59 ++-- client/testutil/driver_compatible_default.go | 22 ++ client/testutil/driver_compatible_linux.go | 56 +++ command/agent/config.go | 5 + drivers/docker/config.go | 4 +- drivers/docker/config_test.go | 2 +- drivers/docker/driver.go | 20 +- drivers/docker/driver_darwin.go | 2 + drivers/docker/driver_darwin_test.go | 2 + drivers/docker/driver_default.go | 1 - drivers/docker/driver_linux.go | 4 +- drivers/docker/driver_linux_test.go | 2 + drivers/docker/driver_test.go | 4 +- drivers/docker/driver_unix_test.go | 1 - drivers/docker/driver_windows.go | 2 + drivers/docker/driver_windows_test.go | 1 - drivers/docker/fingerprint.go | 7 +- drivers/docker/handle.go | 6 + drivers/docker/reconcile_cpuset.go | 125 +++++++ drivers/docker/reconcile_cpuset_noop.go | 19 + drivers/docker/reconcile_cpuset_test.go | 36 ++ 
.../{reconciler.go => reconcile_dangling.go} | 0 ...ler_test.go => reconcile_dangling_test.go} | 0 drivers/docker/util/stats_posix.go | 1 - drivers/docker/utils_unix_test.go | 1 - drivers/docker/utils_windows_test.go | 1 - drivers/exec/driver.go | 5 +- drivers/exec/driver_test.go | 283 ++++++++------- drivers/exec/driver_unix_test.go | 49 ++- drivers/java/driver_test.go | 63 ++-- drivers/rawexec/driver_test.go | 5 + drivers/rawexec/driver_unix_test.go | 52 +-- drivers/shared/executor/executor.go | 3 +- drivers/shared/executor/executor_linux.go | 67 +--- .../shared/executor/executor_linux_test.go | 105 +++--- drivers/shared/executor/executor_test.go | 9 + .../executor/executor_universal_linux.go | 4 +- .../executor/{client.go => grpc_client.go} | 0 .../executor/{server.go => grpc_server.go} | 0 .../executor/resource_container_linux.go | 1 + go.mod | 14 +- go.sum | 27 +- lib/cpuset/cpuset.go | 11 + lib/cpuset/cpuset_test.go | 15 +- plugins/drivers/driver.go | 2 +- plugins/drivers/testutils/exec_testing.go | 32 +- plugins/drivers/testutils/testing.go | 44 ++- .../content/docs/upgrade/upgrade-specific.mdx | 25 ++ 71 files changed, 1937 insertions(+), 669 deletions(-) create mode 100644 .changelog/12274.txt delete mode 100644 client/lib/cgutil/cgutil_default.go create mode 100644 client/lib/cgutil/cgutil_linux_test.go create mode 100644 client/lib/cgutil/cgutil_noop.go delete mode 100644 client/lib/cgutil/cpuset_manager_default.go create mode 100644 client/lib/cgutil/cpuset_manager_test.go rename client/lib/cgutil/{cpuset_manager_linux.go => cpuset_manager_v1.go} (59%) rename client/lib/cgutil/{cpuset_manager_linux_test.go => cpuset_manager_v1_test.go} (80%) create mode 100644 client/lib/cgutil/cpuset_manager_v2.go create mode 100644 client/lib/cgutil/cpuset_manager_v2_test.go create mode 100644 client/testutil/driver_compatible_default.go create mode 100644 client/testutil/driver_compatible_linux.go create mode 100644 drivers/docker/reconcile_cpuset.go create mode 
100644 drivers/docker/reconcile_cpuset_noop.go create mode 100644 drivers/docker/reconcile_cpuset_test.go rename drivers/docker/{reconciler.go => reconcile_dangling.go} (100%) rename drivers/docker/{reconciler_test.go => reconcile_dangling_test.go} (100%) rename drivers/shared/executor/{client.go => grpc_client.go} (100%) rename drivers/shared/executor/{server.go => grpc_server.go} (100%) diff --git a/.changelog/12274.txt b/.changelog/12274.txt new file mode 100644 index 000000000000..3f47ae714a62 --- /dev/null +++ b/.changelog/12274.txt @@ -0,0 +1,3 @@ +```release-note:improvement +Enable support for cgroups v2 +``` diff --git a/.github/workflows/test-core.yaml b/.github/workflows/test-core.yaml index 5f2a66d3b841..672458ef3b9d 100644 --- a/.github/workflows/test-core.yaml +++ b/.github/workflows/test-core.yaml @@ -96,7 +96,7 @@ jobs: - client/devicemanager - client/dynamicplugins - client/fingerprint - # - client/lib/... + - client/lib/... - client/logmon - client/pluginmanager - client/state @@ -105,8 +105,8 @@ jobs: - client/taskenv - command - command/agent - # - drivers/docker - # - drivers/exec + - drivers/docker + - drivers/exec - drivers/java - drivers/rawexec - helper/... 
diff --git a/client/alloc_endpoint_test.go b/client/alloc_endpoint_test.go index 2e51a3702c46..272819ecc35c 100644 --- a/client/alloc_endpoint_test.go +++ b/client/alloc_endpoint_test.go @@ -992,6 +992,7 @@ func TestAlloc_ExecStreaming_ACL_WithIsolation_Image(t *testing.T) { // TestAlloc_ExecStreaming_ACL_WithIsolation_Chroot asserts that token only needs // alloc-exec acl policy when chroot isolation is used func TestAlloc_ExecStreaming_ACL_WithIsolation_Chroot(t *testing.T) { + ci.SkipSlow(t, "flaky on GHA; too much disk IO") ci.Parallel(t) if runtime.GOOS != "linux" || unix.Geteuid() != 0 { diff --git a/client/allocrunner/testing.go b/client/allocrunner/testing.go index da850719ef4e..41ee296ff838 100644 --- a/client/allocrunner/testing.go +++ b/client/allocrunner/testing.go @@ -70,7 +70,7 @@ func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, fu PrevAllocMigrator: allocwatcher.NoopPrevAlloc{}, DeviceManager: devicemanager.NoopMockManager(), DriverManager: drivermanager.TestDriverManager(t), - CpusetManager: cgutil.NoopCpusetManager(), + CpusetManager: new(cgutil.NoopCpusetManager), ServersContactedCh: make(chan struct{}), } return conf, cleanup diff --git a/client/client.go b/client/client.go index 826453170d8c..5b473a7fc773 100644 --- a/client/client.go +++ b/client/client.go @@ -365,7 +365,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie invalidAllocs: make(map[string]struct{}), serversContactedCh: make(chan struct{}), serversContactedOnce: sync.Once{}, - cpusetManager: cgutil.NewCpusetManager(cfg.CgroupParent, logger.Named("cpuset_manager")), + cpusetManager: cgutil.CreateCPUSetManager(cfg.CgroupParent, logger), EnterpriseClient: newEnterpriseClient(logger), } @@ -657,19 +657,23 @@ func (c *Client) init() error { // Ensure cgroups are created on linux platform if runtime.GOOS == "linux" && c.cpusetManager != nil { - err := c.cpusetManager.Init() - if err != nil { - // if the client cannot 
initialize the cgroup then reserved cores will not be reported and the cpuset manager - // will be disabled. this is common when running in dev mode under a non-root user for example - c.logger.Warn("could not initialize cpuset cgroup subsystem, cpuset management disabled", "error", err) - c.cpusetManager = cgutil.NoopCpusetManager() + // use the client configuration for reservable_cores if set + cores := c.config.ReservableCores + if len(cores) == 0 { + // otherwise lookup the effective cores from the parent cgroup + cores, _ = cgutil.GetCPUsFromCgroup(c.config.CgroupParent) + } + if cpuErr := c.cpusetManager.Init(cores); cpuErr != nil { + // If the client cannot initialize the cgroup then reserved cores will not be reported and the cpuset manager + // will be disabled. this is common when running in dev mode under a non-root user for example. + c.logger.Warn("failed to initialize cpuset cgroup subsystem, cpuset management disabled", "error", cpuErr) + c.cpusetManager = new(cgutil.NoopCpusetManager) } } return nil } -// reloadTLSConnections allows a client to reload its TLS configuration on the -// fly +// reloadTLSConnections allows a client to reload its TLS configuration on the fly func (c *Client) reloadTLSConnections(newConfig *nconfig.TLSConfig) error { var tlsWrap tlsutil.RegionWrapper if newConfig != nil && newConfig.EnableRPC { diff --git a/client/client_test.go b/client/client_test.go index bb99e6cc9208..c8cdb0f0a282 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -17,6 +17,7 @@ import ( "github.com/hashicorp/nomad/client/config" consulApi "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/fingerprint" + cstate "github.com/hashicorp/nomad/client/state" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/pluginutils/catalog" "github.com/hashicorp/nomad/helper/pluginutils/singleton" @@ -30,8 +31,6 @@ import ( psstructs "github.com/hashicorp/nomad/plugins/shared/structs" 
"github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" - - cstate "github.com/hashicorp/nomad/client/state" "github.com/stretchr/testify/require" ) diff --git a/client/config/config.go b/client/config/config.go index 9ec256eabb6d..4767cd12e1ec 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -774,7 +774,7 @@ func DefaultConfig() *Config { CNIConfigDir: "/opt/cni/config", CNIInterfacePrefix: "eth", HostNetworks: map[string]*structs.ClientHostNetworkConfig{}, - CgroupParent: cgutil.DefaultCgroupParent, + CgroupParent: cgutil.GetCgroupParent(""), MaxDynamicPort: structs.DefaultMinDynamicPort, MinDynamicPort: structs.DefaultMaxDynamicPort, } diff --git a/client/fingerprint/cgroup.go b/client/fingerprint/cgroup.go index b0097984c79f..b24c3f0a3442 100644 --- a/client/fingerprint/cgroup.go +++ b/client/fingerprint/cgroup.go @@ -3,55 +3,86 @@ package fingerprint import ( "time" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/lib/cgutil" - - log "github.com/hashicorp/go-hclog" ) const ( + cgroupAvailable = "available" cgroupUnavailable = "unavailable" - interval = 15 + + cgroupMountPointAttribute = "unique.cgroup.mountpoint" + cgroupVersionAttribute = "unique.cgroup.version" + + cgroupDetectInterval = 15 * time.Second ) type CGroupFingerprint struct { - logger log.Logger + logger hclog.Logger lastState string mountPointDetector MountPointDetector + versionDetector CgroupVersionDetector } -// An interface to isolate calls to the cgroup library -// This facilitates testing where we can implement -// fake mount points to test various code paths +// MountPointDetector isolates calls to the cgroup library. +// +// This facilitates testing where we can implement fake mount points to test +// various code paths. type MountPointDetector interface { + // MountPoint returns a cgroup mount-point. + // + // In v1, this is one arbitrary subsystem (e.g. /sys/fs/cgroup/cpu). 
+ // + // In v2, this is the actual root mount point (i.e. /sys/fs/cgroup). MountPoint() (string, error) } -// Implements the interface detector which calls the cgroups library directly +// DefaultMountPointDetector implements the interface detector which calls the cgroups +// library directly type DefaultMountPointDetector struct { } // MountPoint calls out to the default cgroup library. -func (b *DefaultMountPointDetector) MountPoint() (string, error) { +func (*DefaultMountPointDetector) MountPoint() (string, error) { return cgutil.FindCgroupMountpointDir() } +// CgroupVersionDetector isolates calls to the cgroup library. +type CgroupVersionDetector interface { + // CgroupVersion returns v1 or v2 depending on the cgroups version in use. + CgroupVersion() string +} + +// DefaultCgroupVersionDetector implements the version detector which calls the +// cgroups library directly. +type DefaultCgroupVersionDetector struct { +} + +func (*DefaultCgroupVersionDetector) CgroupVersion() string { + if cgutil.UseV2 { + return "v2" + } + return "v1" +} + // NewCGroupFingerprint returns a new cgroup fingerprinter -func NewCGroupFingerprint(logger log.Logger) Fingerprint { - f := &CGroupFingerprint{ +func NewCGroupFingerprint(logger hclog.Logger) Fingerprint { + return &CGroupFingerprint{ logger: logger.Named("cgroup"), lastState: cgroupUnavailable, - mountPointDetector: &DefaultMountPointDetector{}, + mountPointDetector: new(DefaultMountPointDetector), + versionDetector: new(DefaultCgroupVersionDetector), } - return f } // clearCGroupAttributes clears any node attributes related to cgroups that might // have been set in a previous fingerprint run. func (f *CGroupFingerprint) clearCGroupAttributes(r *FingerprintResponse) { - r.RemoveAttribute("unique.cgroup.mountpoint") + r.RemoveAttribute(cgroupMountPointAttribute) + r.RemoveAttribute(cgroupVersionAttribute) } // Periodic determines the interval at which the periodic fingerprinter will run. 
func (f *CGroupFingerprint) Periodic() (bool, time.Duration) { - return true, interval * time.Second + return true, cgroupDetectInterval } diff --git a/client/fingerprint/cgroup_default.go b/client/fingerprint/cgroup_default.go index 272540834ea8..f1e53c603885 100644 --- a/client/fingerprint/cgroup_default.go +++ b/client/fingerprint/cgroup_default.go @@ -1,5 +1,4 @@ //go:build !linux -// +build !linux package fingerprint diff --git a/client/fingerprint/cgroup_linux.go b/client/fingerprint/cgroup_linux.go index 36cefdf0f310..a951dca6552a 100644 --- a/client/fingerprint/cgroup_linux.go +++ b/client/fingerprint/cgroup_linux.go @@ -1,5 +1,4 @@ //go:build linux -// +build linux package fingerprint @@ -7,31 +6,30 @@ import ( "fmt" ) -const ( - cgroupAvailable = "available" -) - -// Fingerprint tries to find a valid cgroup mount point +// Fingerprint tries to find a valid cgroup mount point and the version of cgroups +// if a mount-point is present. func (f *CGroupFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error { mount, err := f.mountPointDetector.MountPoint() if err != nil { f.clearCGroupAttributes(resp) - return fmt.Errorf("Failed to discover cgroup mount point: %s", err) + return fmt.Errorf("failed to discover cgroup mount point: %s", err) } - // Check if a cgroup mount point was found + // Check if a cgroup mount point was found. if mount == "" { - f.clearCGroupAttributes(resp) - if f.lastState == cgroupAvailable { - f.logger.Info("cgroups are unavailable") + f.logger.Warn("cgroups are now unavailable") } f.lastState = cgroupUnavailable return nil } - resp.AddAttribute("unique.cgroup.mountpoint", mount) + // Check the version in use. 
+ version := f.versionDetector.CgroupVersion() + + resp.AddAttribute(cgroupMountPointAttribute, mount) + resp.AddAttribute(cgroupVersionAttribute, version) resp.Detected = true if f.lastState == cgroupUnavailable { diff --git a/client/fingerprint/cgroup_test.go b/client/fingerprint/cgroup_test.go index 11119b1d09e7..23205169ab27 100644 --- a/client/fingerprint/cgroup_test.go +++ b/client/fingerprint/cgroup_test.go @@ -1,5 +1,4 @@ //go:build linux -// +build linux package fingerprint @@ -11,6 +10,7 @@ import ( "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) // A fake mount point detector that returns an empty path @@ -41,94 +41,116 @@ func (m *MountPointDetectorEmptyMountPoint) MountPoint() (string, error) { return "", nil } -func TestCGroupFingerprint(t *testing.T) { +// A fake version detector that returns the set version. +type FakeVersionDetector struct { + version string +} + +func (f *FakeVersionDetector) CgroupVersion() string { + return f.version +} + +func newRequest(node *structs.Node) *FingerprintRequest { + return &FingerprintRequest{ + Config: new(config.Config), + Node: node, + } +} + +func newNode() *structs.Node { + return &structs.Node{ + Attributes: make(map[string]string), + } +} + +func TestCgroup_MountPoint(t *testing.T) { ci.Parallel(t) - { + t.Run("mount-point fail", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorMountPointFail{}, - } - - node := &structs.Node{ - Attributes: make(map[string]string), + mountPointDetector: new(MountPointDetectorMountPointFail), + versionDetector: new(DefaultCgroupVersionDetector), } - request := &FingerprintRequest{Config: &config.Config{}, Node: node} + request := newRequest(newNode()) var response FingerprintResponse err := f.Fingerprint(request, &response) - if err == nil { - 
t.Fatalf("expected an error") - } - - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a != "" { - t.Fatalf("unexpected attribute found, %s", a) - } - } + require.EqualError(t, err, "failed to discover cgroup mount point: cgroup mountpoint discovery failed") + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) - { + t.Run("mount-point available", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorValidMountPoint{}, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), - } - - request := &FingerprintRequest{Config: &config.Config{}, Node: node} + request := newRequest(newNode()) var response FingerprintResponse err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, ok := response.Attributes["unique.cgroup.mountpoint"]; !ok { - t.Fatalf("unable to find attribute: %s", a) - } - } + require.NoError(t, err) + require.Equal(t, "/sys/fs/cgroup", response.Attributes[cgroupMountPointAttribute]) + }) - { + t.Run("mount-point empty", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupUnavailable, - mountPointDetector: &MountPointDetectorEmptyMountPoint{}, + mountPointDetector: new(MountPointDetectorEmptyMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), - } - - request := &FingerprintRequest{Config: &config.Config{}, Node: node} var response FingerprintResponse - err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a != "" { - t.Fatalf("unexpected attribute found, %s", a) - } - } - { + err := f.Fingerprint(newRequest(newNode()), &response) + 
require.NoError(t, err) + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) + + t.Run("mount-point already present", func(t *testing.T) { f := &CGroupFingerprint{ logger: testlog.HCLogger(t), lastState: cgroupAvailable, - mountPointDetector: &MountPointDetectorValidMountPoint{}, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: new(DefaultCgroupVersionDetector), } - node := &structs.Node{ - Attributes: make(map[string]string), + var response FingerprintResponse + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Equal(t, "/sys/fs/cgroup", response.Attributes[cgroupMountPointAttribute]) + }) +} + +func TestCgroup_Version(t *testing.T) { + t.Run("version v1", func(t *testing.T) { + f := &CGroupFingerprint{ + logger: testlog.HCLogger(t), + lastState: cgroupUnavailable, + mountPointDetector: new(MountPointDetectorValidMountPoint), + versionDetector: &FakeVersionDetector{version: "v1"}, } - request := &FingerprintRequest{Config: &config.Config{}, Node: node} var response FingerprintResponse - err := f.Fingerprint(request, &response) - if err != nil { - t.Fatalf("unexpected error, %s", err) - } - if a, _ := response.Attributes["unique.cgroup.mountpoint"]; a == "" { - t.Fatalf("expected attribute to be found, %s", a) + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Equal(t, "v1", response.Attributes[cgroupVersionAttribute]) + }) + + t.Run("without mount-point", func(t *testing.T) { + f := &CGroupFingerprint{ + logger: testlog.HCLogger(t), + lastState: cgroupUnavailable, + mountPointDetector: new(MountPointDetectorEmptyMountPoint), + versionDetector: &FakeVersionDetector{version: "v1"}, } - } + + var response FingerprintResponse + err := f.Fingerprint(newRequest(newNode()), &response) + require.NoError(t, err) + require.Empty(t, response.Attributes[cgroupMountPointAttribute]) + }) } diff --git a/client/fingerprint/cpu_linux.go 
b/client/fingerprint/cpu_linux.go index a31b8e60bf67..14c80faa3aa7 100644 --- a/client/fingerprint/cpu_linux.go +++ b/client/fingerprint/cpu_linux.go @@ -5,9 +5,8 @@ import ( ) func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) { - parent := req.Config.CgroupParent - if parent == "" { - parent = cgutil.DefaultCgroupParent - } - return cgutil.GetCPUsFromCgroup(parent) + // The cpuset cgroup manager is initialized (on linux), but not accessible + // from the finger-printer. So we reach in and grab the information manually. + // We may assume the hierarchy is already setup. + return cgutil.GetCPUsFromCgroup(req.Config.CgroupParent) } diff --git a/client/lib/cgutil/cgutil_default.go b/client/lib/cgutil/cgutil_default.go deleted file mode 100644 index c2fc74e0da28..000000000000 --- a/client/lib/cgutil/cgutil_default.go +++ /dev/null @@ -1,14 +0,0 @@ -//go:build !linux -// +build !linux - -package cgutil - -const ( - DefaultCgroupParent = "" -) - -// FindCgroupMountpointDir is used to find the cgroup mount point on a Linux -// system. 
Here it is a no-op implemtation -func FindCgroupMountpointDir() (string, error) { - return "", nil -} diff --git a/client/lib/cgutil/cgutil_linux.go b/client/lib/cgutil/cgutil_linux.go index e8b867aa1f19..349660ec3e6b 100644 --- a/client/lib/cgutil/cgutil_linux.go +++ b/client/lib/cgutil/cgutil_linux.go @@ -1,121 +1,106 @@ +//go:build linux + package cgutil import ( + "fmt" "os" "path/filepath" - cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" - + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper/uuid" "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" - "golang.org/x/sys/unix" -) - -const ( - DefaultCgroupParent = "/nomad" - SharedCpusetCgroupName = "shared" - ReservedCpusetCgroupName = "reserved" + lcc "github.com/opencontainers/runc/libcontainer/configs" ) -func GetCPUsFromCgroup(group string) ([]uint16, error) { - cgroupPath, err := getCgroupPathHelper("cpuset", group) - if err != nil { - return nil, err - } +// UseV2 indicates whether only cgroups.v2 is enabled. If cgroups.v2 is not +// enabled or is running in hybrid mode with cgroups.v1, Nomad will make use of +// cgroups.v1 +// +// This is a read-only value. +var UseV2 = cgroups.IsCgroup2UnifiedMode() + +// GetCgroupParent returns the mount point under the root cgroup in which Nomad +// will create cgroups. If parent is not set, an appropriate name for the version +// of cgroups will be used. +func GetCgroupParent(parent string) string { + if UseV2 { + return getParentV2(parent) + } + return getParentV1(parent) +} - man := cgroupFs.NewManager(&configs.Cgroup{Path: group}, map[string]string{"cpuset": cgroupPath}, false) - stats, err := man.GetStats() - if err != nil { - return nil, err +// CreateCPUSetManager creates a V1 or V2 CpusetManager depending on system configuration. 
+func CreateCPUSetManager(parent string, logger hclog.Logger) CpusetManager { + if UseV2 { + return NewCpusetManagerV2(getParentV2(parent), logger.Named("cpuset.v2")) } - return stats.CPUSetStats.CPUs, nil + return NewCpusetManagerV1(getParentV1(parent), logger.Named("cpuset.v1")) } -func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { - if cpus, err = fscommon.ReadFile(parent, "cpuset.cpus"); err != nil { - return - } - if mems, err = fscommon.ReadFile(parent, "cpuset.mems"); err != nil { - return +// GetCPUsFromCgroup gets the effective cpuset value for the given cgroup. +func GetCPUsFromCgroup(group string) ([]uint16, error) { + if UseV2 { + return getCPUsFromCgroupV2(getParentV2(group)) } - return cpus, mems, nil + return getCPUsFromCgroupV1(getParentV1(group)) } -// cpusetEnsureParent makes sure that the parent directories of current -// are created and populated with the proper cpus and mems files copied -// from their respective parent. It does that recursively, starting from -// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). -func cpusetEnsureParent(current string) error { - var st unix.Statfs_t +// CgroupScope returns the name of the scope for Nomad's managed cgroups for +// the given allocID and task. +// +// e.g. "-.scope" +// +// Only useful for v2. +func CgroupScope(allocID, task string) string { + return fmt.Sprintf("%s.%s.scope", allocID, task) +} - parent := filepath.Dir(current) - err := unix.Statfs(parent, &st) - if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { +// ConfigureBasicCgroups will initialize cgroups for v1. +// +// Not useful in cgroups.v2 +func ConfigureBasicCgroups(config *lcc.Config) error { + if UseV2 { + // In v2 the default behavior is to create inherited interface files for + // all mounted subsystems automatically. return nil } - // Treat non-existing directory as cgroupfs as it will be created, - // and the root cpuset directory obviously exists. 
- if err != nil && err != unix.ENOENT { - return &os.PathError{Op: "statfs", Path: parent, Err: err} - } - if err := cpusetEnsureParent(parent); err != nil { - return err - } - if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) { - return err - } - return cpusetCopyIfNeeded(current, parent) -} - -// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent -// directory to the current directory if the file's contents are 0 -func cpusetCopyIfNeeded(current, parent string) error { - currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + id := uuid.Generate() + // In v1 we must setup the freezer cgroup ourselves. + subsystem := "freezer" + path, err := GetCgroupPathHelperV1(subsystem, filepath.Join(DefaultCgroupV1Parent, id)) if err != nil { - return err + return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err) } - parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) - if err != nil { + if err = os.MkdirAll(path, 0755); err != nil { return err } - - if isEmptyCpuset(currentCpus) { - if err := fscommon.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { - return err - } - } - if isEmptyCpuset(currentMems) { - if err := fscommon.WriteFile(current, "cpuset.mems", parentMems); err != nil { - return err - } + config.Cgroups.Paths = map[string]string{ + subsystem: path, } return nil } -func isEmptyCpuset(str string) bool { - return str == "" || str == "\n" -} - -func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) - if err != nil { - return "", err - } - - // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. 
- relCgroup, err := filepath.Rel(root, cgroup) - if err != nil { - return "", err - } - - return filepath.Join(mnt, relCgroup), nil -} - // FindCgroupMountpointDir is used to find the cgroup mount point on a Linux // system. +// +// Note that in cgroups.v1, this returns one of many subsystems that are mounted. +// e.g. a return value of "/sys/fs/cgroup/systemd" really implies the root is +// "/sys/fs/cgroup", which is interesting on hybrid systems where the 'unified' +// subsystem is mounted as if it were a subsystem, but the actual root is different. +// (i.e. /sys/fs/cgroup/unified). +// +// As far as Nomad is concerned, UseV2 is the source of truth for which hierarchy +// to use, and that will only be a true value if cgroups.v2 is mounted on +// /sys/fs/cgroup (i.e. system is not in v1 or hybrid mode). +// +// ➜ mount -l | grep cgroup +// tmpfs on /sys/fs/cgroup type tmpfs (ro,nosuid,nodev,noexec,mode=755,inode64) +// cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate) +// cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) +// cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory) +// (etc.) func FindCgroupMountpointDir() (string, error) { mount, err := cgroups.GetCgroupMounts(false) if err != nil { @@ -127,3 +112,18 @@ func FindCgroupMountpointDir() (string, error) { } return mount[0].Mountpoint, nil } + +// CopyCpuset copies the cpuset.cpus value from source into destination. 
+func CopyCpuset(source, destination string) error { + correct, err := cgroups.ReadFile(source, "cpuset.cpus") + if err != nil { + return err + } + + err = cgroups.WriteFile(destination, "cpuset.cpus", correct) + if err != nil { + return err + } + + return nil +} diff --git a/client/lib/cgutil/cgutil_linux_test.go b/client/lib/cgutil/cgutil_linux_test.go new file mode 100644 index 000000000000..bbc18ef8c3e9 --- /dev/null +++ b/client/lib/cgutil/cgutil_linux_test.go @@ -0,0 +1,124 @@ +//go:build linux + +package cgutil + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/stretchr/testify/require" +) + +func TestUtil_GetCgroupParent(t *testing.T) { + ci.Parallel(t) + + t.Run("v1", func(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + t.Run("default", func(t *testing.T) { + exp := "/nomad" + parent := GetCgroupParent("") + require.Equal(t, exp, parent) + }) + + t.Run("configured", func(t *testing.T) { + exp := "/bar" + parent := GetCgroupParent("/bar") + require.Equal(t, exp, parent) + }) + }) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + t.Run("default", func(t *testing.T) { + exp := "nomad.slice" + parent := GetCgroupParent("") + require.Equal(t, exp, parent) + }) + + t.Run("configured", func(t *testing.T) { + exp := "abc.slice" + parent := GetCgroupParent("abc.slice") + require.Equal(t, exp, parent) + }) + }) +} + +func TestUtil_CreateCPUSetManager(t *testing.T) { + ci.Parallel(t) + + logger := testlog.HCLogger(t) + + t.Run("v1", func(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + parent := "/" + uuid.Short() + manager := CreateCPUSetManager(parent, logger) + err := manager.Init([]uint16{0}) + require.NoError(t, err) + 
require.NoError(t, cgroups.RemovePath(filepath.Join(CgroupRoot, parent))) + }) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + parent := uuid.Short() + ".slice" + manager := CreateCPUSetManager(parent, logger) + err := manager.Init([]uint16{0}) + require.NoError(t, err) + require.NoError(t, cgroups.RemovePath(filepath.Join(CgroupRoot, parent))) + }) +} + +func TestUtil_GetCPUsFromCgroup(t *testing.T) { + ci.Parallel(t) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + cpus, err := GetCPUsFromCgroup("system.slice") // thanks, systemd! + require.NoError(t, err) + require.NotEmpty(t, cpus) + }) +} + +func create(t *testing.T, name string) { + mgr, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, name), rootless) + require.NoError(t, err) + err = mgr.Apply(CreationPID) + require.NoError(t, err) +} + +func cleanup(t *testing.T, name string) { + err := cgroups.RemovePath(name) + require.NoError(t, err) +} + +func TestUtil_CopyCpuset(t *testing.T) { + ci.Parallel(t) + + t.Run("v2", func(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + source := uuid.Short() + ".scope" + create(t, source) + defer cleanup(t, source) + require.NoError(t, cgroups.WriteFile(filepath.Join(CgroupRoot, source), "cpuset.cpus", "0-1")) + + destination := uuid.Short() + ".scope" + create(t, destination) + defer cleanup(t, destination) + + err := CopyCpuset( + filepath.Join(CgroupRoot, source), + filepath.Join(CgroupRoot, destination), + ) + require.NoError(t, err) + + value, readErr := cgroups.ReadFile(filepath.Join(CgroupRoot, destination), "cpuset.cpus") + require.NoError(t, readErr) + require.Equal(t, "0-1", strings.TrimSpace(value)) + }) +} diff --git a/client/lib/cgutil/cgutil_noop.go b/client/lib/cgutil/cgutil_noop.go new file mode 100644 index 000000000000..91a35b14492e --- /dev/null +++ b/client/lib/cgutil/cgutil_noop.go @@ -0,0 +1,42 @@ +//go:build !linux + +package cgutil + +import ( + "github.com/hashicorp/go-hclog" +) + +const ( + // 
DefaultCgroupParent does not apply to non-Linux operating systems. + DefaultCgroupParent = "" +) + +// UseV2 is always false on non-Linux systems. +// +// This is a read-only value. +var UseV2 = false + +// CreateCPUSetManager creates a no-op CpusetManager for non-Linux operating systems. +func CreateCPUSetManager(string, hclog.Logger) CpusetManager { + return new(NoopCpusetManager) +} + +// FindCgroupMountpointDir returns nothing for non-Linux operating systems. +func FindCgroupMountpointDir() (string, error) { + return "", nil +} + +// GetCgroupParent returns nothing for non-Linux operating systems. +func GetCgroupParent(string) string { + return DefaultCgroupParent +} + +// GetCPUsFromCgroup returns nothing for non-Linux operating systems. +func GetCPUsFromCgroup(string) ([]uint16, error) { + return nil, nil +} + +// CgroupScope returns nothing for non-Linux operating systems. +func CgroupScope(allocID, task string) string { + return "" +} diff --git a/client/lib/cgutil/cpuset_manager.go b/client/lib/cgutil/cpuset_manager.go index 809af90fcbf1..7d16c752f84c 100644 --- a/client/lib/cgutil/cpuset_manager.go +++ b/client/lib/cgutil/cpuset_manager.go @@ -2,18 +2,26 @@ package cgutil import ( "context" + "fmt" + "path/filepath" + "strings" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/structs" ) -// CpusetManager is used to setup cpuset cgroups for each task. A pool of shared cpus is managed for -// tasks which don't require any reserved cores and a cgroup is managed seperetly for each task which -// require reserved cores. +const ( + // CgroupRoot is hard-coded in the cgroups specification. + // It only applies to linux but helpers have references to it in driver(s). + CgroupRoot = "/sys/fs/cgroup" +) + +// CpusetManager is used to setup cpuset cgroups for each task. 
type CpusetManager interface { - // Init should be called before any tasks are managed to ensure the cgroup parent exists and - // check that proper permissions are granted to manage cgroups. - Init() error + // Init should be called with the initial set of reservable cores before any + // allocations are managed. Ensures the parent cgroup exists and proper permissions + // are available for managing cgroups. + Init([]uint16) error // AddAlloc adds an allocation to the manager AddAlloc(alloc *structs.Allocation) @@ -26,8 +34,26 @@ type CpusetManager interface { CgroupPathFor(allocID, taskName string) CgroupPathGetter } -// CgroupPathGetter is a function which returns the cgroup path and any error which ocured during cgroup initialization. -// It should block until the cgroup has been created or an error is reported +type NoopCpusetManager struct{} + +func (n NoopCpusetManager) Init([]uint16) error { + return nil +} + +func (n NoopCpusetManager) AddAlloc(alloc *structs.Allocation) { +} + +func (n NoopCpusetManager) RemoveAlloc(allocID string) { +} + +func (n NoopCpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { + return func(context.Context) (string, error) { return "", nil } +} + +// CgroupPathGetter is a function which returns the cgroup path and any error which +// occurred during cgroup initialization. +// +// It should block until the cgroup has been created or an error is reported. type CgroupPathGetter func(context.Context) (path string, err error) type TaskCgroupInfo struct { @@ -37,20 +63,25 @@ type TaskCgroupInfo struct { Error error } -func NoopCpusetManager() CpusetManager { return noopCpusetManager{} } - -type noopCpusetManager struct{} +// identity is the "." 
string that uniquely identifies an +// individual instance of a task within the flat cgroup namespace +type identity string -func (n noopCpusetManager) Init() error { - return nil -} - -func (n noopCpusetManager) AddAlloc(alloc *structs.Allocation) { +func makeID(allocID, task string) identity { + return identity(fmt.Sprintf("%s.%s", allocID, task)) } -func (n noopCpusetManager) RemoveAlloc(allocID string) { +func makeScope(id identity) string { + return string(id) + ".scope" } -func (n noopCpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { - return func(context.Context) (string, error) { return "", nil } +// SplitPath determines the parent and cgroup from p. +// p must contain at least 2 elements (parent + cgroup). +// +// Handles the cgroup root if present. +func SplitPath(p string) (string, string) { + p = strings.TrimPrefix(p, CgroupRoot) + p = strings.Trim(p, "/") + parts := strings.Split(p, "/") + return parts[0], "/" + filepath.Join(parts[1:]...) } diff --git a/client/lib/cgutil/cpuset_manager_default.go b/client/lib/cgutil/cpuset_manager_default.go deleted file mode 100644 index 1f8c077baa6d..000000000000 --- a/client/lib/cgutil/cpuset_manager_default.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build !linux -// +build !linux - -package cgutil - -import ( - "github.com/hashicorp/go-hclog" -) - -func NewCpusetManager(_ string, _ hclog.Logger) CpusetManager { return noopCpusetManager{} } diff --git a/client/lib/cgutil/cpuset_manager_test.go b/client/lib/cgutil/cpuset_manager_test.go new file mode 100644 index 000000000000..88f661411d26 --- /dev/null +++ b/client/lib/cgutil/cpuset_manager_test.go @@ -0,0 +1,28 @@ +package cgutil + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/stretchr/testify/require" +) + +func TestUtil_SplitPath(t *testing.T) { + ci.Parallel(t) + + try := func(input, expParent, expCgroup string) { + parent, cgroup := SplitPath(input) + require.Equal(t, expParent, parent) + require.Equal(t, expCgroup, 
cgroup) + } + + // foo, /bar + try("foo/bar", "foo", "/bar") + try("/foo/bar/", "foo", "/bar") + try("/sys/fs/cgroup/foo/bar", "foo", "/bar") + + // foo, /bar/baz + try("/foo/bar/baz/", "foo", "/bar/baz") + try("foo/bar/baz", "foo", "/bar/baz") + try("/sys/fs/cgroup/foo/bar/baz", "foo", "/bar/baz") +} diff --git a/client/lib/cgutil/cpuset_manager_linux.go b/client/lib/cgutil/cpuset_manager_v1.go similarity index 59% rename from client/lib/cgutil/cpuset_manager_linux.go rename to client/lib/cgutil/cpuset_manager_v1.go index ba583c043c22..0e910705290e 100644 --- a/client/lib/cgutil/cpuset_manager_linux.go +++ b/client/lib/cgutil/cpuset_manager_v1.go @@ -1,3 +1,5 @@ +//go:build linux + package cgutil import ( @@ -10,21 +12,28 @@ import ( "sync" "time" - "github.com/opencontainers/runc/libcontainer/cgroups" - cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/structs" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +const ( + DefaultCgroupV1Parent = "/nomad" + SharedCpusetCgroupName = "shared" + ReservedCpusetCgroupName = "reserved" ) -func NewCpusetManager(cgroupParent string, logger hclog.Logger) CpusetManager { +// NewCpusetManagerV1 creates a CpusetManager compatible with cgroups.v1 +func NewCpusetManagerV1(cgroupParent string, logger hclog.Logger) CpusetManager { if cgroupParent == "" { - cgroupParent = DefaultCgroupParent + cgroupParent = DefaultCgroupV1Parent } - return &cpusetManager{ + return &cpusetManagerV1{ cgroupParent: cgroupParent, cgroupInfo: map[string]allocTaskCgroupInfo{}, logger: logger, @@ -35,7 +44,7 @@ var ( 
cpusetReconcileInterval = 30 * time.Second ) -type cpusetManager struct { +type cpusetManagerV1 struct { // cgroupParent relative to the cgroup root. ex. '/nomad' cgroupParent string // cgroupParentPath is the absolute path to the cgroup parent. @@ -53,7 +62,7 @@ type cpusetManager struct { logger hclog.Logger } -func (c *cpusetManager) AddAlloc(alloc *structs.Allocation) { +func (c *cpusetManagerV1) AddAlloc(alloc *structs.Allocation) { if alloc == nil || alloc.AllocatedResources == nil { return } @@ -77,14 +86,14 @@ func (c *cpusetManager) AddAlloc(alloc *structs.Allocation) { go c.signalReconcile() } -func (c *cpusetManager) RemoveAlloc(allocID string) { +func (c *cpusetManagerV1) RemoveAlloc(allocID string) { c.mu.Lock() delete(c.cgroupInfo, allocID) c.mu.Unlock() go c.signalReconcile() } -func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { +func (c *cpusetManagerV1) CgroupPathFor(allocID, task string) CgroupPathGetter { return func(ctx context.Context) (string, error) { c.mu.Lock() allocInfo, ok := c.cgroupInfo[allocID] @@ -99,15 +108,21 @@ func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { return "", fmt.Errorf("task %q not found", task) } + timer, stop := helper.NewSafeTimer(0) + defer stop() + for { + if taskInfo.Error != nil { break } + + timer.Reset(100 * time.Millisecond) if _, err := os.Stat(taskInfo.CgroupPath); os.IsNotExist(err) { select { case <-ctx.Done(): return taskInfo.CgroupPath, ctx.Err() - case <-time.After(100 * time.Millisecond): + case <-timer.C: continue } } @@ -119,24 +134,25 @@ func (c *cpusetManager) CgroupPathFor(allocID, task string) CgroupPathGetter { } +// task name -> task cgroup info type allocTaskCgroupInfo map[string]*TaskCgroupInfo // Init checks that the cgroup parent and expected child cgroups have been created // If the cgroup parent is set to /nomad then this will ensure that the /nomad/shared // cgroup is initialized. 
-func (c *cpusetManager) Init() error { - cgroupParentPath, err := getCgroupPathHelper("cpuset", c.cgroupParent) +func (c *cpusetManagerV1) Init(_ []uint16) error { + cgroupParentPath, err := GetCgroupPathHelperV1("cpuset", c.cgroupParent) if err != nil { return err } c.cgroupParentPath = cgroupParentPath // ensures that shared cpuset exists and that the cpuset values are copied from the parent if created - if err := cpusetEnsureParent(filepath.Join(cgroupParentPath, SharedCpusetCgroupName)); err != nil { + if err := cpusetEnsureParentV1(filepath.Join(cgroupParentPath, SharedCpusetCgroupName)); err != nil { return err } - parentCpus, parentMems, err := getCpusetSubsystemSettings(cgroupParentPath) + parentCpus, parentMems, err := getCpusetSubsystemSettingsV1(cgroupParentPath) if err != nil { return fmt.Errorf("failed to detect parent cpuset settings: %v", err) } @@ -155,7 +171,7 @@ func (c *cpusetManager) Init() error { return err } - if err := fscommon.WriteFile(filepath.Join(cgroupParentPath, ReservedCpusetCgroupName), "cpuset.mems", parentMems); err != nil { + if err := cgroups.WriteFile(filepath.Join(cgroupParentPath, ReservedCpusetCgroupName), "cpuset.mems", parentMems); err != nil { return err } @@ -168,7 +184,7 @@ func (c *cpusetManager) Init() error { return nil } -func (c *cpusetManager) reconcileLoop() { +func (c *cpusetManagerV1) reconcileLoop() { timer := time.NewTimer(0) if !timer.Stop() { <-timer.C @@ -189,7 +205,7 @@ func (c *cpusetManager) reconcileLoop() { } } -func (c *cpusetManager) reconcileCpusets() { +func (c *cpusetManagerV1) reconcileCpusets() { c.mu.Lock() defer c.mu.Unlock() sharedCpuset := cpuset.New(c.parentCpuset.ToSlice()...) 
@@ -240,13 +256,13 @@ func (c *cpusetManager) reconcileCpusets() { } // copy cpuset.mems from parent - _, parentMems, err := getCpusetSubsystemSettings(filepath.Dir(info.CgroupPath)) + _, parentMems, err := getCpusetSubsystemSettingsV1(filepath.Dir(info.CgroupPath)) if err != nil { c.logger.Error("failed to read parent cgroup settings for task", "path", info.CgroupPath, "error", err) info.Error = err continue } - if err := fscommon.WriteFile(info.CgroupPath, "cpuset.mems", parentMems); err != nil { + if err := cgroups.WriteFile(info.CgroupPath, "cpuset.mems", parentMems); err != nil { c.logger.Error("failed to write cgroup cpuset.mems setting for task", "path", info.CgroupPath, "mems", parentMems, "error", err) info.Error = err continue @@ -260,30 +276,30 @@ func (c *cpusetManager) reconcileCpusets() { } // setCgroupCpusetCPUs will compare an existing cpuset.cpus value with an expected value, overwriting the existing if different -// must hold a lock on cpusetManager.mu before calling -func (_ *cpusetManager) setCgroupCpusetCPUs(path, cpus string) error { - currentCpusRaw, err := fscommon.ReadFile(path, "cpuset.cpus") +// must hold a lock on cpusetManagerV1.mu before calling +func (_ *cpusetManagerV1) setCgroupCpusetCPUs(path, cpus string) error { + currentCpusRaw, err := cgroups.ReadFile(path, "cpuset.cpus") if err != nil { return err } if cpus != strings.TrimSpace(currentCpusRaw) { - if err := fscommon.WriteFile(path, "cpuset.cpus", cpus); err != nil { + if err := cgroups.WriteFile(path, "cpuset.cpus", cpus); err != nil { return err } } return nil } -func (c *cpusetManager) signalReconcile() { +func (c *cpusetManagerV1) signalReconcile() { select { case c.signalCh <- struct{}{}: case <-c.doneCh: } } -func (c *cpusetManager) getCpuset(group string) (cpuset.CPUSet, error) { - man := cgroupFs.NewManager( +func (c *cpusetManagerV1) getCpuset(group string) (cpuset.CPUSet, error) { + man := fs.NewManager( &configs.Cgroup{ Path: filepath.Join(c.cgroupParent, group), }, 
@@ -297,15 +313,119 @@ func (c *cpusetManager) getCpuset(group string) (cpuset.CPUSet, error) { return cpuset.New(stats.CPUSetStats.CPUs...), nil } -func (c *cpusetManager) getCgroupPathsForTask(allocID, task string) (absolute, relative string) { +func (c *cpusetManagerV1) getCgroupPathsForTask(allocID, task string) (absolute, relative string) { return filepath.Join(c.reservedCpusetPath(), fmt.Sprintf("%s-%s", allocID, task)), filepath.Join(c.cgroupParent, ReservedCpusetCgroupName, fmt.Sprintf("%s-%s", allocID, task)) } -func (c *cpusetManager) sharedCpusetPath() string { +func (c *cpusetManagerV1) sharedCpusetPath() string { return filepath.Join(c.cgroupParentPath, SharedCpusetCgroupName) } -func (c *cpusetManager) reservedCpusetPath() string { +func (c *cpusetManagerV1) reservedCpusetPath() string { return filepath.Join(c.cgroupParentPath, ReservedCpusetCgroupName) } + +func getCPUsFromCgroupV1(group string) ([]uint16, error) { + cgroupPath, err := GetCgroupPathHelperV1("cpuset", group) + if err != nil { + return nil, err + } + + man := fs.NewManager(&configs.Cgroup{Path: group}, map[string]string{"cpuset": cgroupPath}, false) + stats, err := man.GetStats() + if err != nil { + return nil, err + } + return stats.CPUSetStats.CPUs, nil +} + +func getParentV1(parent string) string { + if parent == "" { + return DefaultCgroupV1Parent + } + return parent +} + +// cpusetEnsureParentV1 makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). 
+func cpusetEnsureParentV1(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParentV1(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeededV1(current, parent) +} + +// cpusetCopyIfNeededV1 copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeededV1(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettingsV1(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettingsV1(parent) + if err != nil { + return err + } + + if isEmptyCpusetV1(currentCpus) { + if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { + return err + } + } + if isEmptyCpusetV1(currentMems) { + if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil { + return err + } + } + return nil +} + +func getCpusetSubsystemSettingsV1(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil { + return + } + if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil { + return + } + return cpus, mems, nil +} + +func isEmptyCpusetV1(str string) bool { + return str == "" || str == "\n" +} + +func GetCgroupPathHelperV1(subsystem, cgroup string) (string, error) { + mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see 
paths from host, which don't exist in container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + result := filepath.Join(mnt, relCgroup) + return result, nil +} diff --git a/client/lib/cgutil/cpuset_manager_linux_test.go b/client/lib/cgutil/cpuset_manager_v1_test.go similarity index 80% rename from client/lib/cgutil/cpuset_manager_linux_test.go rename to client/lib/cgutil/cpuset_manager_v1_test.go index e2bb00679c36..3ada7b551a44 100644 --- a/client/lib/cgutil/cpuset_manager_linux_test.go +++ b/client/lib/cgutil/cpuset_manager_v1_test.go @@ -1,51 +1,48 @@ +//go:build linux + package cgutil import ( "io/ioutil" "path/filepath" - "runtime" - "syscall" "testing" + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/lib/cpuset" "github.com/hashicorp/nomad/nomad/mock" "github.com/opencontainers/runc/libcontainer/cgroups" - - "github.com/hashicorp/nomad/helper/uuid" - "github.com/stretchr/testify/require" - - "github.com/hashicorp/nomad/helper/testlog" ) -func tmpCpusetManager(t *testing.T) (manager *cpusetManager, cleanup func()) { - if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { - t.Skip("Test only available running as root on linux") - } +func tmpCpusetManagerV1(t *testing.T) (manager *cpusetManagerV1, cleanup func()) { mount, err := FindCgroupMountpointDir() if err != nil || mount == "" { t.Skipf("Failed to find cgroup mount: %v %v", mount, err) } parent := "/gotest-" + uuid.Short() - require.NoError(t, cpusetEnsureParent(parent)) + require.NoError(t, cpusetEnsureParentV1(parent)) - manager = &cpusetManager{ + manager = &cpusetManagerV1{ cgroupParent: parent, cgroupInfo: map[string]allocTaskCgroupInfo{}, logger: testlog.HCLogger(t), } - parentPath, err := getCgroupPathHelper("cpuset", parent) + parentPath, err := GetCgroupPathHelperV1("cpuset", parent) require.NoError(t, err) return manager, func() { 
require.NoError(t, cgroups.RemovePaths(map[string]string{"cpuset": parentPath})) } } -func TestCpusetManager_Init(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) +func TestCpusetManager_V1_Init(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) + require.NoError(t, manager.Init(nil)) require.DirExists(t, filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName)) require.FileExists(t, filepath.Join(manager.cgroupParentPath, SharedCpusetCgroupName, "cpuset.cpus")) @@ -57,10 +54,12 @@ func TestCpusetManager_Init(t *testing.T) { require.DirExists(t, filepath.Join(manager.cgroupParentPath, ReservedCpusetCgroupName)) } -func TestCpusetManager_AddAlloc_single(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) +func TestCpusetManager_V1_AddAlloc_single(t *testing.T) { + testutil.CgroupsCompatibleV1(t) + + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) + require.NoError(t, manager.Init(nil)) alloc := mock.Alloc() // reserve just one core (the 0th core, which probably exists) @@ -104,26 +103,19 @@ func TestCpusetManager_AddAlloc_single(t *testing.T) { require.Exactly(t, alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores, taskCpus.ToSlice()) } -func TestCpusetManager_AddAlloc_subset(t *testing.T) { - t.Skip("todo: add test for #11933") -} +func TestCpusetManager_V1_RemoveAlloc(t *testing.T) { + testutil.CgroupsCompatibleV1(t) -func TestCpusetManager_AddAlloc_all(t *testing.T) { - // cgroupsv2 changes behavior of writing empty cpuset.cpu, which is what - // happens to the /shared group when one or more allocs consume all available - // cores. 
- t.Skip("todo: add test for #11933") -} - -func TestCpusetManager_RemoveAlloc(t *testing.T) { - manager, cleanup := tmpCpusetManager(t) + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() - require.NoError(t, manager.Init()) - - // this case tests adding 2 allocs, reconciling then removing 1 alloc - // it requires the system to have atleast 2 cpu cores (one for each alloc) - if manager.parentCpuset.Size() < 2 { - t.Skip("test requires atleast 2 cpu cores") + require.NoError(t, manager.Init(nil)) + + // This case tests adding 2 allocs, reconciling then removing 1 alloc + // it requires the system to have at least 3 cpu cores (one for each alloc), + // BUT plus another one because writing an empty cpuset causes the cgroup to + // inherit the parent. + if manager.parentCpuset.Size() < 3 { + t.Skip("test requires at least 3 cpu cores") } alloc1 := mock.Alloc() diff --git a/client/lib/cgutil/cpuset_manager_v2.go b/client/lib/cgutil/cpuset_manager_v2.go new file mode 100644 index 000000000000..00162ca523a5 --- /dev/null +++ b/client/lib/cgutil/cpuset_manager_v2.go @@ -0,0 +1,332 @@ +//go:build linux + +package cgutil + +import ( + "context" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/lib/cpuset" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + // CreationPID is a special PID in libcontainer used to denote a cgroup + // should be created, but with no process added. + // + // https://github.com/opencontainers/runc/blob/v1.0.3/libcontainer/cgroups/utils.go#L372 + CreationPID = -1 + + // DefaultCgroupParentV2 is the name of Nomad's default parent cgroup, under which + // all other cgroups are managed. This can be changed with client configuration + // in case for e.g. 
Nomad tasks should be further constrained by an externally + // configured systemd cgroup. + DefaultCgroupParentV2 = "nomad.slice" + + // rootless is (for now) always false; Nomad clients require root, so we + // assume to not need to do the extra plumbing for rootless cgroups. + rootless = false +) + +// nothing is used for treating a map like a set with no values +type nothing struct{} + +// null represents nothing +var null = nothing{} + +type cpusetManagerV2 struct { + logger hclog.Logger + + parent string // relative to cgroup root (e.g. "nomad.slice") + parentAbs string // absolute path (e.g. "/sys/fs/cgroup/nomad.slice") + initial cpuset.CPUSet // set of initial cores (never changes) + + lock sync.Mutex // hold this when managing pool / sharing / isolating + pool cpuset.CPUSet // pool of cores being shared among all tasks + sharing map[identity]nothing // sharing tasks using cores only from the pool + isolating map[identity]cpuset.CPUSet // isolating tasks using cores from the pool + reserved cores +} + +func NewCpusetManagerV2(parent string, logger hclog.Logger) CpusetManager { + cgroupParent := getParentV2(parent) + return &cpusetManagerV2{ + parent: cgroupParent, + parentAbs: filepath.Join(CgroupRoot, cgroupParent), + logger: logger, + sharing: make(map[identity]nothing), + isolating: make(map[identity]cpuset.CPUSet), + } +} + +func (c *cpusetManagerV2) Init(cores []uint16) error { + c.logger.Debug("initializing with", "cores", cores) + if err := c.ensureParent(); err != nil { + c.logger.Error("failed to init cpuset manager", "err", err) + return err + } + c.initial = cpuset.New(cores...) 
+ return nil +} + +func (c *cpusetManagerV2) AddAlloc(alloc *structs.Allocation) { + if alloc == nil || alloc.AllocatedResources == nil { + return + } + c.logger.Trace("add allocation", "name", alloc.Name, "id", alloc.ID) + + // grab write lock while we recompute and apply changes + c.lock.Lock() + defer c.lock.Unlock() + + // first update our tracking of isolating and sharing tasks + for task, resources := range alloc.AllocatedResources.Tasks { + id := makeID(alloc.ID, task) + if len(resources.Cpu.ReservedCores) > 0 { + c.isolating[id] = cpuset.New(resources.Cpu.ReservedCores...) + } else { + c.sharing[id] = null + } + } + + // recompute the available sharable cpu cores + c.recalculate() + + // now write out the entire cgroups space + c.reconcile() + + // no need to cleanup on adds, we did not remove a task +} + +func (c *cpusetManagerV2) RemoveAlloc(allocID string) { + c.logger.Trace("remove allocation", "id", allocID) + + // grab write lock while we recompute and apply changes. + c.lock.Lock() + defer c.lock.Unlock() + + // remove tasks of allocID from the sharing set + for id := range c.sharing { + if strings.HasPrefix(string(id), allocID) { + delete(c.sharing, id) + } + } + + // remove tasks of allocID from the isolating set + for id := range c.isolating { + if strings.HasPrefix(string(id), allocID) { + delete(c.isolating, id) + } + } + + // recompute available sharable cpu cores + c.recalculate() + + // now write out the entire cgroups space + c.reconcile() + + // now remove any tasks no longer running + c.cleanup() +} + +func (c *cpusetManagerV2) CgroupPathFor(allocID, task string) CgroupPathGetter { + // The CgroupPathFor implementation must block until cgroup for allocID.task + // exists [and can accept a PID]. 
+ return func(ctx context.Context) (string, error) { + ticks, cancel := helper.NewSafeTimer(100 * time.Millisecond) + defer cancel() + + for { + path := c.pathOf(makeID(allocID, task)) + mgr, err := fs2.NewManager(nil, path, rootless) + if err != nil { + return "", err + } + + if mgr.Exists() { + return path, nil + } + + select { + case <-ctx.Done(): + return "", ctx.Err() + case <-ticks.C: + continue + } + } + } +} + +// recalculate the number of cores sharable by non-isolating tasks (and isolating tasks) +// +// must be called while holding c.lock +func (c *cpusetManagerV2) recalculate() { + remaining := c.initial.Copy() + for _, set := range c.isolating { + remaining = remaining.Difference(set) + } + c.pool = remaining +} + +// reconcile will actually write the cpuset values for all tracked tasks. +// +// must be called while holding c.lock +func (c *cpusetManagerV2) reconcile() { + for id := range c.sharing { + c.write(id, c.pool) + } + + for id, set := range c.isolating { + c.write(id, c.pool.Union(set)) + } +} + +// cleanup will remove any cgroups for allocations no longer being tracked +// +// must be called while holding c.lock +func (c *cpusetManagerV2) cleanup() { + // create a map to lookup ids we know about + size := len(c.sharing) + len(c.isolating) + ids := make(map[identity]nothing, size) + for id := range c.sharing { + ids[id] = null + } + for id := range c.isolating { + ids[id] = null + } + + if err := filepath.WalkDir(c.parentAbs, func(path string, entry os.DirEntry, err error) error { + // a cgroup is a directory + if !entry.IsDir() { + return nil + } + + dir := filepath.Dir(path) + base := filepath.Base(path) + + // only manage scopes directly under nomad.slice + if dir != c.parentAbs || !strings.HasSuffix(base, ".scope") { + return nil + } + + // only remove the scope if we do not track it + id := identity(strings.TrimSuffix(base, ".scope")) + _, exists := ids[id] + if !exists { + c.remove(path) + } + + return nil + }); err != nil { + 
c.logger.Error("failed to cleanup cgroup", "err", err)
+	}
+}
+
+// pathOf returns the absolute path to a task with identity id.
+func (c *cpusetManagerV2) pathOf(id identity) string {
+	return filepath.Join(c.parentAbs, makeScope(id))
+}
+
+// remove does the actual fs delete of the cgroup
+//
+// We avoid removing a cgroup if it still contains a PID, as the cpuset manager
+// may be initially empty on a Nomad client restart.
+func (c *cpusetManagerV2) remove(path string) {
+	mgr, err := fs2.NewManager(nil, path, rootless)
+	if err != nil {
+		c.logger.Warn("failed to create manager", "path", path, "err", err)
+		return
+	}
+
+	// get the list of pids managed by this scope (should be 0 or 1)
+	pids, _ := mgr.GetPids()
+
+	// do not destroy the scope if a PID is still present
+	// this is a normal condition when an agent restarts with running tasks
+	// and the v2 manager is still rebuilding its tracked tasks
+	if len(pids) > 0 {
+		return
+	}
+
+	// remove the cgroup
+	if err3 := mgr.Destroy(); err3 != nil {
+		c.logger.Warn("failed to cleanup cgroup", "path", path, "err", err3)
+		return
+	}
+}
+
+// write does the actual write of cpuset set for cgroup id
+func (c *cpusetManagerV2) write(id identity, set cpuset.CPUSet) {
+	path := c.pathOf(id)
+
+	// make a manager for the cgroup
+	m, err := fs2.NewManager(nil, path, rootless)
+	if err != nil {
+		c.logger.Error("failed to manage cgroup", "path", path, "err", err)
+	}
+
+	// create the cgroup
+	if err = m.Apply(CreationPID); err != nil {
+		c.logger.Error("failed to apply cgroup", "path", path, "err", err)
+	}
+
+	// set the cpuset value for the cgroup
+	if err = m.Set(&configs.Resources{
+		CpusetCpus: set.String(),
+	}); err != nil {
+		c.logger.Error("failed to set cgroup", "path", path, "err", err)
+	}
+}
+
+// ensureParent will create parent cgroup for the manager if it does not
+// exist yet. No PIDs are added to any cgroup yet. 
+func (c *cpusetManagerV2) ensureParent() error { + mgr, err := fs2.NewManager(nil, c.parentAbs, rootless) + if err != nil { + return err + } + + if err = mgr.Apply(CreationPID); err != nil { + return err + } + + c.logger.Trace("establish cgroup hierarchy", "parent", c.parent) + return nil +} + +// fromRoot returns the joined filepath of group on the CgroupRoot +func fromRoot(group string) string { + return filepath.Join(CgroupRoot, group) +} + +// getCPUsFromCgroupV2 retrieves the effective cpuset for the group, which must +// be directly under the cgroup root (i.e. the parent, like nomad.slice). +func getCPUsFromCgroupV2(group string) ([]uint16, error) { + path := fromRoot(group) + effective, err := cgroups.ReadFile(path, "cpuset.cpus.effective") + if err != nil { + return nil, err + } + set, err := cpuset.Parse(effective) + if err != nil { + return nil, err + } + return set.ToSlice(), nil +} + +// getParentV2 returns parent if set, otherwise the default name of Nomad's +// parent cgroup (i.e. nomad.slice). +func getParentV2(parent string) string { + if parent == "" { + return DefaultCgroupParentV2 + } + return parent +} diff --git a/client/lib/cgutil/cpuset_manager_v2_test.go b/client/lib/cgutil/cpuset_manager_v2_test.go new file mode 100644 index 000000000000..e1d658c82589 --- /dev/null +++ b/client/lib/cgutil/cpuset_manager_v2_test.go @@ -0,0 +1,90 @@ +//go:build linux + +package cgutil + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/lib/cpuset" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/stretchr/testify/require" +) + +// Note: these tests need to run on GitHub Actions runners with only 2 cores. 
+// It is not possible to write more cores to a cpuset than are actually available, +// so make sure tests account for that by setting systemCores as the full set of +// usable cores. +var systemCores = []uint16{0, 1} + +func TestCpusetManager_V2_AddAlloc(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + + logger := testlog.HCLogger(t) + parent := uuid.Short() + ".scope" + create(t, parent) + cleanup(t, parent) + + // setup the cpuset manager + manager := NewCpusetManagerV2(parent, logger) + require.NoError(t, manager.Init(systemCores)) + + // add our first alloc, isolating 1 core + t.Run("first", func(t *testing.T) { + alloc := mock.Alloc() + alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(0).ToSlice() + manager.AddAlloc(alloc) + cpusetIs(t, "0-1", parent, alloc.ID, "web") + }) + + // add second alloc, isolating 1 core + t.Run("second", func(t *testing.T) { + alloc := mock.Alloc() + alloc.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(1).ToSlice() + manager.AddAlloc(alloc) + cpusetIs(t, "1", parent, alloc.ID, "web") + }) + + // note that the scheduler, not the cpuset manager, is what prevents over-subscription + // and as such no logic exists here to prevent that +} + +func cpusetIs(t *testing.T, exp, parent, allocID, task string) { + scope := makeScope(makeID(allocID, task)) + value, err := cgroups.ReadFile(filepath.Join(CgroupRoot, parent, scope), "cpuset.cpus") + require.NoError(t, err) + require.Equal(t, exp, strings.TrimSpace(value)) +} + +func TestCpusetManager_V2_RemoveAlloc(t *testing.T) { + testutil.CgroupsCompatibleV2(t) + + logger := testlog.HCLogger(t) + parent := uuid.Short() + ".scope" + create(t, parent) + cleanup(t, parent) + + // setup the cpuset manager + manager := NewCpusetManagerV2(parent, logger) + require.NoError(t, manager.Init(systemCores)) + + // alloc1 gets core 0 + alloc1 := mock.Alloc() + alloc1.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(0).ToSlice() + manager.AddAlloc(alloc1) 
+ + // alloc2 gets core 1 + alloc2 := mock.Alloc() + alloc2.AllocatedResources.Tasks["web"].Cpu.ReservedCores = cpuset.New(1).ToSlice() + manager.AddAlloc(alloc2) + cpusetIs(t, "1", parent, alloc2.ID, "web") + + // with alloc1 gone, alloc2 gets the now shared core + manager.RemoveAlloc(alloc1.ID) + cpusetIs(t, "0-1", parent, alloc2.ID, "web") +} diff --git a/client/testutil/driver_compatible.go b/client/testutil/driver_compatible.go index ff0ecc747f34..2459f8aa6c86 100644 --- a/client/testutil/driver_compatible.go +++ b/client/testutil/driver_compatible.go @@ -5,18 +5,18 @@ import ( "runtime" "syscall" "testing" - - "github.com/hashicorp/nomad/client/lib/cgutil" ) -// RequireRoot skips tests unless running on a Unix as root. +// RequireRoot skips tests unless: +// - running as root func RequireRoot(t *testing.T) { if syscall.Geteuid() != 0 { - t.Skip("Must run as root on Unix") + t.Skip("Test requires root") } } -// RequireConsul skips tests unless a Consul binary is available on $PATH. +// RequireConsul skips tests unless: +// - "consul" executable is detected on $PATH func RequireConsul(t *testing.T) { _, err := exec.Command("consul", "version").CombinedOutput() if err != nil { @@ -24,7 +24,8 @@ func RequireConsul(t *testing.T) { } } -// RequireVault skips tests unless a Vault binary is available on $PATH. 
+// RequireVault skips tests unless: +// - "vault" executable is detected on $PATH func RequireVault(t *testing.T) { _, err := exec.Command("vault", "version").CombinedOutput() if err != nil { @@ -32,19 +33,41 @@ func RequireVault(t *testing.T) { } } +// RequireLinux skips tests unless: +// - running on Linux +func RequireLinux(t *testing.T) { + if runtime.GOOS != "linux" { + t.Skip("Test requires Linux") + } +} + +// ExecCompatible skips tests unless: +// - running as root +// - running on Linux func ExecCompatible(t *testing.T) { if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { - t.Skip("Test only available running as root on linux") + t.Skip("Test requires root on Linux") } - CgroupCompatible(t) } +// JavaCompatible skips tests unless: +// - "java" executable is detected on $PATH +// - running as root +// - running on Linux func JavaCompatible(t *testing.T) { - if runtime.GOOS == "linux" && syscall.Geteuid() != 0 { - t.Skip("Test only available when running as root on linux") + _, err := exec.Command("java", "-version").CombinedOutput() + if err != nil { + t.Skipf("Test requires Java: %v", err) + } + + if runtime.GOOS != "linux" || syscall.Geteuid() != 0 { + t.Skip("Test requires root on Linux") } } +// QemuCompatible skips tests unless: +// - "qemu-system-x86_64" executable is detected on $PATH (!windows) +// - "qemu-img" executable is detected on on $PATH (windows) func QemuCompatible(t *testing.T) { // Check if qemu exists bin := "qemu-system-x86_64" @@ -53,23 +76,19 @@ func QemuCompatible(t *testing.T) { } _, err := exec.Command(bin, "--version").CombinedOutput() if err != nil { - t.Skip("Must have Qemu installed for Qemu specific tests to run") - } -} - -func CgroupCompatible(t *testing.T) { - mount, err := cgutil.FindCgroupMountpointDir() - if err != nil || mount == "" { - t.Skipf("Failed to find cgroup mount: %v %v", mount, err) + t.Skipf("Test requires QEMU (%s)", bin) } } +// MountCompatible skips tests unless: +// - not running as windows +// - 
running as root func MountCompatible(t *testing.T) { if runtime.GOOS == "windows" { - t.Skip("Windows does not support mount") + t.Skip("Test requires not using Windows") } if syscall.Geteuid() != 0 { - t.Skip("Must be root to run test") + t.Skip("Test requires root") } } diff --git a/client/testutil/driver_compatible_default.go b/client/testutil/driver_compatible_default.go new file mode 100644 index 000000000000..2f7b3cb8a0ed --- /dev/null +++ b/client/testutil/driver_compatible_default.go @@ -0,0 +1,22 @@ +//go:build !linux + +package testutil + +import ( + "testing" +) + +// CgroupsCompatible returns false on non-Linux operating systems. +func CgroupsCompatible(t *testing.T) { + t.Skipf("Test requires cgroups support on Linux") +} + +// CgroupsCompatibleV1 skips tests on non-Linux operating systems. +func CgroupsCompatibleV1(t *testing.T) { + t.Skipf("Test requires cgroup.v1 support on Linux") +} + +// CgroupsCompatibleV2 skips tests on non-Linux operating systems. +func CgroupsCompatibleV2(t *testing.T) { + t.Skipf("Test requires cgroup.v2 support on Linux") +} diff --git a/client/testutil/driver_compatible_linux.go b/client/testutil/driver_compatible_linux.go new file mode 100644 index 000000000000..84ace6015f74 --- /dev/null +++ b/client/testutil/driver_compatible_linux.go @@ -0,0 +1,56 @@ +//go:build linux + +package testutil + +import ( + "runtime" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +// CgroupsCompatible returns true if either cgroups.v1 or cgroups.v2 is supported. 
+func CgroupsCompatible(t *testing.T) bool { + return cgroupsCompatibleV1(t) || cgroupsCompatibleV2(t) +} + +// CgroupsCompatibleV1 skips tests unless: +// - cgroup.v1 mount point is detected +func CgroupsCompatibleV1(t *testing.T) { + if !cgroupsCompatibleV1(t) { + t.Skipf("Test requires cgroup.v1 support") + } +} + +func cgroupsCompatibleV1(t *testing.T) bool { + if runtime.GOOS != "linux" { + return false + } + + if cgroupsCompatibleV2(t) { + t.Log("No cgroup.v1 mount point: running in cgroup.v2 mode") + return false + } + mount, err := cgroups.GetCgroupMounts(false) + if err != nil { + t.Logf("Unable to detect cgroup.v1 mount point: %v", err) + return false + } + if len(mount) == 0 { + t.Logf("No cgroup.v1 mount point: empty path") + return false + } + return true +} + +// CgroupsCompatibleV2 skips tests unless: +// - cgroup.v2 unified mode is detected +func CgroupsCompatibleV2(t *testing.T) { + if !cgroupsCompatibleV2(t) { + t.Skip("Test requires cgroup.v2 support") + } +} + +func cgroupsCompatibleV2(t *testing.T) bool { + return cgroups.IsCgroup2UnifiedMode() +} diff --git a/command/agent/config.go b/command/agent/config.go index 7ed47aa73d2e..9c062b1a9d53 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -1763,6 +1763,11 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig { if b.BindWildcardDefaultHostNetwork { result.BindWildcardDefaultHostNetwork = true } + + if b.CgroupParent != "" { + result.CgroupParent = b.CgroupParent + } + return &result } diff --git a/drivers/docker/config.go b/drivers/docker/config.go index d45d6f24cab5..6194d60f1e46 100644 --- a/drivers/docker/config.go +++ b/drivers/docker/config.go @@ -754,7 +754,9 @@ func (d *Driver) SetConfig(c *base.Config) error { d.coordinator = newDockerCoordinator(coordinatorConfig) - d.reconciler = newReconciler(d) + d.danglingReconciler = newReconciler(d) + + d.cpusetFixer = newCpusetFixer(d) return nil } diff --git a/drivers/docker/config_test.go 
b/drivers/docker/config_test.go index df237a440202..d5f1acafc959 100644 --- a/drivers/docker/config_test.go +++ b/drivers/docker/config_test.go @@ -704,7 +704,7 @@ func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) { func TestConfig_DriverConfig_AllowRuntimes(t *testing.T) { ci.Parallel(t) - + cases := []struct { name string config string diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index f86879ebc475..d6e904204aef 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -20,6 +20,7 @@ import ( hclog "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" plugin "github.com/hashicorp/go-plugin" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/drivers/docker/docklog" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -122,7 +123,8 @@ type Driver struct { detected bool detectedLock sync.RWMutex - reconciler *containerReconciler + danglingReconciler *containerReconciler + cpusetFixer CpusetFixer } // NewDockerDriver returns a docker implementation of a driver plugin @@ -350,9 +352,14 @@ CREATE: container.ID, "container_state", container.State.String()) } - if containerCfg.HostConfig.CPUSet == "" && cfg.Resources.LinuxResources.CpusetCgroupPath != "" { - if err := setCPUSetCgroup(cfg.Resources.LinuxResources.CpusetCgroupPath, container.State.Pid); err != nil { - return nil, nil, fmt.Errorf("failed to set the cpuset cgroup for container: %v", err) + if !cgutil.UseV2 { + // This does not apply to cgroups.v2, which only allows setting the PID + // into exactly 1 group. For cgroups.v2, we use the cpuset fixer to reconcile + // the cpuset value into the cgroups created by docker in the background. 
+ if containerCfg.HostConfig.CPUSet == "" && cfg.Resources.LinuxResources.CpusetCgroupPath != "" { + if err := setCPUSetCgroup(cfg.Resources.LinuxResources.CpusetCgroupPath, container.State.Pid); err != nil { + return nil, nil, fmt.Errorf("failed to set the cpuset cgroup for container: %v", err) + } } } @@ -841,7 +848,12 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T pidsLimit = driverConfig.PidsLimit } + // Extract the cgroup parent from the nomad cgroup (bypass the need for plugin config) + parent, _ := cgutil.SplitPath(task.Resources.LinuxResources.CpusetCgroupPath) + hostConfig := &docker.HostConfig{ + CgroupParent: parent, + Memory: memory, // hard limit MemoryReservation: memoryReservation, // soft limit diff --git a/drivers/docker/driver_darwin.go b/drivers/docker/driver_darwin.go index b0b1c81dfffc..2db02e4bf6d7 100644 --- a/drivers/docker/driver_darwin.go +++ b/drivers/docker/driver_darwin.go @@ -1,3 +1,5 @@ +//go:build darwin + package docker func setCPUSetCgroup(path string, pid int) error { diff --git a/drivers/docker/driver_darwin_test.go b/drivers/docker/driver_darwin_test.go index 18bfddd6828c..628e89a3e93e 100644 --- a/drivers/docker/driver_darwin_test.go +++ b/drivers/docker/driver_darwin_test.go @@ -1,3 +1,5 @@ +//go:build darwin + package docker import ( diff --git a/drivers/docker/driver_default.go b/drivers/docker/driver_default.go index 8607b899b199..b7bef7173408 100644 --- a/drivers/docker/driver_default.go +++ b/drivers/docker/driver_default.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package docker diff --git a/drivers/docker/driver_linux.go b/drivers/docker/driver_linux.go index b10d3241a670..97cf680b840d 100644 --- a/drivers/docker/driver_linux.go +++ b/drivers/docker/driver_linux.go @@ -1,3 +1,5 @@ +//go:build linux + package docker import ( @@ -7,7 +9,7 @@ import ( ) func setCPUSetCgroup(path string, pid int) error { - // Sometimes the container exists before we can write the + // 
Sometimes the container exits before we can write the // cgroup resulting in an error which can be ignored. err := cgroups.WriteCgroupProc(path, pid) if err != nil && strings.Contains(err.Error(), "no such process") { diff --git a/drivers/docker/driver_linux_test.go b/drivers/docker/driver_linux_test.go index 006517b28fef..88c83fbba5cb 100644 --- a/drivers/docker/driver_linux_test.go +++ b/drivers/docker/driver_linux_test.go @@ -1,3 +1,5 @@ +//go:build linux + package docker import ( diff --git a/drivers/docker/driver_test.go b/drivers/docker/driver_test.go index 26cbf99161d2..cdffca518ae1 100644 --- a/drivers/docker/driver_test.go +++ b/drivers/docker/driver_test.go @@ -1507,9 +1507,7 @@ func TestDockerDriver_Init(t *testing.T) { func TestDockerDriver_CPUSetCPUs(t *testing.T) { ci.Parallel(t) testutil.DockerCompatible(t) - if runtime.GOOS == "windows" { - t.Skip("Windows does not support CPUSetCPUs.") - } + testutil.CgroupsCompatible(t) testCases := []struct { Name string diff --git a/drivers/docker/driver_unix_test.go b/drivers/docker/driver_unix_test.go index fb4fbba254e6..7f019af51614 100644 --- a/drivers/docker/driver_unix_test.go +++ b/drivers/docker/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package docker diff --git a/drivers/docker/driver_windows.go b/drivers/docker/driver_windows.go index 7eda62386f08..f3a4e22ec162 100644 --- a/drivers/docker/driver_windows.go +++ b/drivers/docker/driver_windows.go @@ -1,3 +1,5 @@ +//go:build windows + package docker import docker "github.com/fsouza/go-dockerclient" diff --git a/drivers/docker/driver_windows_test.go b/drivers/docker/driver_windows_test.go index f54b83f20956..765917231862 100644 --- a/drivers/docker/driver_windows_test.go +++ b/drivers/docker/driver_windows_test.go @@ -1,5 +1,4 @@ //go:build windows -// +build windows package docker diff --git 
a/drivers/docker/fingerprint.go b/drivers/docker/fingerprint.go index be5fd19c0594..f37a169788e1 100644 --- a/drivers/docker/fingerprint.go +++ b/drivers/docker/fingerprint.go @@ -13,9 +13,10 @@ import ( ) func (d *Driver) Fingerprint(ctx context.Context) (<-chan *drivers.Fingerprint, error) { - // start reconciler when we start fingerprinting - // this is the only method called when driver is launched properly - d.reconciler.Start() + // Start docker reconcilers when we start fingerprinting, a workaround for + // task drivers not having a kind of post-setup hook. + d.danglingReconciler.Start() + d.cpusetFixer.Start() ch := make(chan *drivers.Fingerprint) go d.handleFingerprint(ctx, ch) diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go index 7913796324a7..d4fc19601bd8 100644 --- a/drivers/docker/handle.go +++ b/drivers/docker/handle.go @@ -244,6 +244,12 @@ func (h *taskHandle) run() { } else if container.State.OOMKilled { oom = true werr = fmt.Errorf("OOM Killed") + } else if container.State.ExitCode == 137 { + // With cgroups.v2 it seems the cgroup OOM killer is not observed by docker + // container status. So just fudge the connection for now. + // [Mon Mar 21 19:48:21 2022] Memory cgroup out of memory: Killed process 92768 (sh) [...] + oom = true + werr = fmt.Errorf("OOM Killed (137)") } // Shutdown stats collection diff --git a/drivers/docker/reconcile_cpuset.go b/drivers/docker/reconcile_cpuset.go new file mode 100644 index 000000000000..4eab59ba00d9 --- /dev/null +++ b/drivers/docker/reconcile_cpuset.go @@ -0,0 +1,125 @@ +//go:build linux + +package docker + +import ( + "context" + "fmt" + "path/filepath" + "sync" + "time" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/lib/cgutil" + "github.com/hashicorp/nomad/helper" +) + +const ( + cpusetReconcileInterval = 1 * time.Second +) + +type CpusetFixer interface { + Start() +} + +// cpusetFixer adjusts the cpuset.cpus cgroup value to the assigned value by Nomad. 
+// +// Due to Docker not allowing the configuration of the full cgroup path, we must +// manually fix the cpuset values for all docker containers continuously, as the +// values will change as tasks of any driver using reserved cores are started and +// stopped, changing the size of the remaining shared cpu pool. +// +// The exec/java, podman, and containerd runtimes let you specify the cgroup path, +// making use of the cgroup Nomad creates and manages on behalf of the task. +type cpusetFixer struct { + ctx context.Context + logger hclog.Logger + interval time.Duration + once sync.Once + tasks func() map[coordinate]struct{} +} + +func newCpusetFixer(d *Driver) CpusetFixer { + return &cpusetFixer{ + interval: cpusetReconcileInterval, + ctx: d.ctx, + logger: d.logger, + tasks: d.trackedTasks, + } +} + +// Start will start the background cpuset reconciliation until the cf context is +// cancelled for shutdown. +// +// Only runs if cgroups.v2 is in use. +func (cf *cpusetFixer) Start() { + cf.once.Do(func() { + if cgutil.UseV2 { + go cf.loop() + } + }) +} + +func (cf *cpusetFixer) loop() { + timer, cancel := helper.NewSafeTimer(0) + defer cancel() + + for { + select { + case <-cf.ctx.Done(): + return + case <-timer.C: + timer.Stop() + cf.apply() + timer.Reset(cf.interval) + } + } +} + +func (cf *cpusetFixer) apply() { + coordinates := cf.tasks() + for c := range coordinates { + cf.fix(c) + } +} + +func (cf *cpusetFixer) fix(c coordinate) { + source := c.NomadCgroup() + destination := c.DockerCgroup() + if err := cgutil.CopyCpuset(source, destination); err != nil { + cf.logger.Debug("failed to copy cpuset", "err", err) + } +} + +type coordinate struct { + containerID string + allocID string + task string + path string +} + +func (c coordinate) NomadCgroup() string { + parent, _ := cgutil.SplitPath(c.path) + return filepath.Join(cgutil.CgroupRoot, parent, cgutil.CgroupScope(c.allocID, c.task)) +} + +func (c coordinate) DockerCgroup() string { + parent, _ := 
cgutil.SplitPath(c.path) + return filepath.Join(cgutil.CgroupRoot, parent, fmt.Sprintf("docker-%s.scope", c.containerID)) +} + +func (d *Driver) trackedTasks() map[coordinate]struct{} { + d.tasks.lock.RLock() + defer d.tasks.lock.RUnlock() + + m := make(map[coordinate]struct{}, len(d.tasks.store)) + for _, h := range d.tasks.store { + m[coordinate{ + containerID: h.containerID, + allocID: h.task.AllocID, + task: h.task.Name, + path: h.task.Resources.LinuxResources.CpusetCgroupPath, + }] = struct{}{} + } + return m +} diff --git a/drivers/docker/reconcile_cpuset_noop.go b/drivers/docker/reconcile_cpuset_noop.go new file mode 100644 index 000000000000..9613a5ad2ee0 --- /dev/null +++ b/drivers/docker/reconcile_cpuset_noop.go @@ -0,0 +1,19 @@ +//go:build !linux + +package docker + +type CpusetFixer interface { + Start() +} + +func newCpusetFixer(*Driver) CpusetFixer { + return new(noop) +} + +type noop struct { + // empty +} + +func (*noop) Start() { + // empty +} diff --git a/drivers/docker/reconcile_cpuset_test.go b/drivers/docker/reconcile_cpuset_test.go new file mode 100644 index 000000000000..07ed71294fa7 --- /dev/null +++ b/drivers/docker/reconcile_cpuset_test.go @@ -0,0 +1,36 @@ +//go:build linux + +package docker + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/stretchr/testify/require" +) + +func TestCoordinate_NomadCgroup(t *testing.T) { + ci.Parallel(t) + + result := (coordinate{ + containerID: "c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1", + allocID: "27ee5321-28d6-22d7-9426-4e1888da8e7d", + task: "redis", + path: "/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope", + }).NomadCgroup() + exp := "/sys/fs/cgroup/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope" + require.Equal(t, exp, result) +} + +func TestCoordinate_DockerCgroup(t *testing.T) { + ci.Parallel(t) + + result := (coordinate{ + containerID: "c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1", + allocID: 
"27ee5321-28d6-22d7-9426-4e1888da8e7d", + task: "redis", + path: "/nomad.scope/27ee5321-28d6-22d7-9426-4e1888da8e7d.redis.scope", + }).DockerCgroup() + exp := "/sys/fs/cgroup/nomad.scope/docker-c6d05b36f4f56619ca59fbce921115e87dda1661860a4670e3e35ecfa3571ba1.scope" + require.Equal(t, exp, result) +} diff --git a/drivers/docker/reconciler.go b/drivers/docker/reconcile_dangling.go similarity index 100% rename from drivers/docker/reconciler.go rename to drivers/docker/reconcile_dangling.go diff --git a/drivers/docker/reconciler_test.go b/drivers/docker/reconcile_dangling_test.go similarity index 100% rename from drivers/docker/reconciler_test.go rename to drivers/docker/reconcile_dangling_test.go diff --git a/drivers/docker/util/stats_posix.go b/drivers/docker/util/stats_posix.go index ba1167f344b2..3a9f03f8f539 100644 --- a/drivers/docker/util/stats_posix.go +++ b/drivers/docker/util/stats_posix.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package util diff --git a/drivers/docker/utils_unix_test.go b/drivers/docker/utils_unix_test.go index e53c72bec8ae..0a61bd545395 100644 --- a/drivers/docker/utils_unix_test.go +++ b/drivers/docker/utils_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package docker diff --git a/drivers/docker/utils_windows_test.go b/drivers/docker/utils_windows_test.go index d433055e9615..72d7e0bbfc85 100644 --- a/drivers/docker/utils_windows_test.go +++ b/drivers/docker/utils_windows_test.go @@ -1,5 +1,4 @@ //go:build windows -// +build windows package docker diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go index 422b8b42aab9..882e3ae62f12 100644 --- a/drivers/exec/driver.go +++ b/drivers/exec/driver.go @@ -9,11 +9,10 @@ import ( "sync" "time" - "github.com/hashicorp/nomad/client/lib/cgutil" - "github.com/hashicorp/nomad/drivers/shared/capabilities" - 
"github.com/hashicorp/consul-template/signals" hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/lib/cgutil" + "github.com/hashicorp/nomad/drivers/shared/capabilities" "github.com/hashicorp/nomad/drivers/shared/eventer" "github.com/hashicorp/nomad/drivers/shared/executor" "github.com/hashicorp/nomad/drivers/shared/resolvconf" diff --git a/drivers/exec/driver_test.go b/drivers/exec/driver_test.go index f5c8d9a2b57f..fed60bea9623 100644 --- a/drivers/exec/driver_test.go +++ b/drivers/exec/driver_test.go @@ -17,6 +17,7 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" ctestutils "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/executor" "github.com/hashicorp/nomad/helper/pluginutils/hclutils" @@ -31,25 +32,41 @@ import ( "github.com/stretchr/testify/require" ) +const ( + cgroupParent = "testing.slice" +) + func TestMain(m *testing.M) { if !testtask.Run() { os.Exit(m.Run()) } } -var testResources = &drivers.Resources{ - NomadResources: &structs.AllocatedTaskResources{ - Memory: structs.AllocatedMemoryResources{ - MemoryMB: 128, +func testResources(allocID, task string) *drivers.Resources { + if allocID == "" || task == "" { + panic("must be set") + } + + r := &drivers.Resources{ + NomadResources: &structs.AllocatedTaskResources{ + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 128, + }, + Cpu: structs.AllocatedCpuResources{ + CpuShares: 100, + }, }, - Cpu: structs.AllocatedCpuResources{ - CpuShares: 100, + LinuxResources: &drivers.LinuxResources{ + MemoryLimitBytes: 134217728, + CPUShares: 100, }, - }, - LinuxResources: &drivers.LinuxResources{ - MemoryLimitBytes: 134217728, - CPUShares: 100, - }, + } + + if cgutil.UseV2 { + r.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, cgroupParent, cgutil.CgroupScope(allocID, task)) + } + + return r } func TestExecDriver_Fingerprint_NonLinux(t *testing.T) { @@ -100,7 +117,6 @@ func 
TestExecDriver_Fingerprint(t *testing.T) { func TestExecDriver_StartWait(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -108,34 +124,35 @@ func TestExecDriver_StartWait(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "cat", Args: []string{"/proc/self/cgroup"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Zero(result.ExitCode) - require.NoError(harness.DestroyTask(task.ID, true)) + require.Zero(t, result.ExitCode) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_StartWaitStopKill(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -143,29 +160,31 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "/bin/bash", Args: []string{"-c", "echo hi; sleep 600"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer 
cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { harness.StopTask(task.ID, 2*time.Second, "SIGINT") @@ -173,9 +192,9 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { select { case result := <-ch: - require.False(result.Successful()) + require.False(t, result.Successful()) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -191,54 +210,55 @@ func TestExecDriver_StartWaitStopKill(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_StartWaitRecover(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) - dctx, dcancel := context.WithCancel(context.Background()) - defer dcancel() + dCtx, dCancel := context.WithCancel(context.Background()) + defer dCancel() - d := NewExecDriver(dctx, testlog.HCLogger(t)) + d := NewExecDriver(dCtx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "/bin/sleep", Args: []string{"5"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := 
harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ctx, cancel := context.WithCancel(context.Background()) ch, err := harness.WaitTask(ctx, handle.Config.ID) - require.NoError(err) + require.NoError(t, err) var wg sync.WaitGroup wg.Add(1) go func() { defer wg.Done() result := <-ch - require.Error(result.Err) + require.Error(t, result.Err) }() - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) cancel() waitCh := make(chan struct{}) @@ -250,31 +270,30 @@ func TestExecDriver_StartWaitRecover(t *testing.T) { select { case <-waitCh: status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateRunning, status.State) + require.NoError(t, err) + require.Equal(t, drivers.TaskStateRunning, status.State) case <-time.After(1 * time.Second): - require.Fail("timeout waiting for task wait to cancel") + require.Fail(t, "timeout waiting for task wait to cancel") } // Loose task d.(*Driver).tasks.Delete(task.ID) _, err = harness.InspectTask(task.ID) - require.Error(err) + require.Error(t, err) - require.NoError(harness.RecoverTask(handle)) + require.NoError(t, harness.RecoverTask(handle)) status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateRunning, status.State) + require.NoError(t, err) + require.Equal(t, drivers.TaskStateRunning, status.State) - require.NoError(harness.StopTask(task.ID, 0, "")) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.StopTask(task.ID, 0, "")) + require.NoError(t, harness.DestroyTask(task.ID, true)) } // TestExecDriver_NoOrphans asserts that when the main // task dies, the orphans in the PID namespaces are killed by the kernel func TestExecDriver_NoOrphans(t *testing.T) { ci.Parallel(t) - r := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -291,13 +310,19 @@ func 
TestExecDriver_NoOrphans(t *testing.T) { } var data []byte - r.NoError(basePlug.MsgPackEncode(&data, config)) + require.NoError(t, basePlug.MsgPackEncode(&data, config)) baseConfig := &basePlug.Config{PluginConfig: data} - r.NoError(harness.SetConfig(baseConfig)) + require.NoError(t, harness.SetConfig(baseConfig)) + allocID := uuid.Generate() task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "test", + AllocID: allocID, + ID: uuid.Generate(), + Name: "test", + } + + if cgutil.UseV2 { + task.Resources = testResources(allocID, "test") } cleanup := harness.MkAllocDir(task, true) @@ -307,21 +332,21 @@ func TestExecDriver_NoOrphans(t *testing.T) { taskConfig["command"] = "/bin/sh" // print the child PID in the task PID namespace, then sleep for 5 seconds to give us a chance to examine processes taskConfig["args"] = []string{"-c", fmt.Sprintf(`sleep 3600 & sleep 20`)} - r.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) handle, _, err := harness.StartTask(task) - r.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) waitCh, err := harness.WaitTask(context.Background(), handle.Config.ID) - r.NoError(err) + require.NoError(t, err) - r.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) var childPids []int taskState := TaskState{} testutil.WaitForResult(func() (bool, error) { - r.NoError(handle.GetDriverState(&taskState)) + require.NoError(t, handle.GetDriverState(&taskState)) if taskState.Pid == 0 { return false, fmt.Errorf("task PID is zero") } @@ -343,14 +368,14 @@ func TestExecDriver_NoOrphans(t *testing.T) { } return true, nil }, func(err error) { - r.NoError(err) + require.NoError(t, err) }) select { case result := <-waitCh: - r.True(result.Successful(), "command failed: %#v", result) + require.True(t, result.Successful(), "command failed: %#v", result) case <-time.After(30 * 
time.Second): - r.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // isProcessRunning returns an error if process is not running @@ -369,7 +394,7 @@ func TestExecDriver_NoOrphans(t *testing.T) { } // task should be dead - r.Error(isProcessRunning(taskState.Pid)) + require.Error(t, isProcessRunning(taskState.Pid)) // all children should eventually be killed by OS testutil.WaitForResult(func() (bool, error) { @@ -384,13 +409,12 @@ func TestExecDriver_NoOrphans(t *testing.T) { } return true, nil }, func(err error) { - r.NoError(err) + require.NoError(t, err) }) } func TestExecDriver_Stats(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) dctx, dcancel := context.WithCancel(context.Background()) @@ -398,45 +422,47 @@ func TestExecDriver_Stats(t *testing.T) { d := NewExecDriver(dctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } tc := &TaskConfig{ Command: "/bin/sleep", Args: []string{"5"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) ctx, cancel := context.WithCancel(context.Background()) defer cancel() statsCh, err := harness.TaskStats(ctx, task.ID, time.Second*10) - require.NoError(err) + require.NoError(t, err) select { case stats := <-statsCh: - require.NotEmpty(stats.ResourceUsage.MemoryStats.Measured) - require.NotZero(stats.Timestamp) - 
require.WithinDuration(time.Now(), time.Unix(0, stats.Timestamp), time.Second) + require.NotEmpty(t, stats.ResourceUsage.MemoryStats.Measured) + require.NotZero(t, stats.Timestamp) + require.WithinDuration(t, time.Now(), time.Unix(0, stats.Timestamp), time.Second) case <-time.After(time.Second): - require.Fail("timeout receiving from channel") + require.Fail(t, "timeout receiving from channel") } - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -444,10 +470,12 @@ func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "test"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -461,34 +489,33 @@ func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { fmt.Sprintf(`sleep 1; echo -n %s > /alloc/%s`, string(exp), file), }, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) // Task should terminate quickly waitCh, err := harness.WaitTask(context.Background(), task.ID) - require.NoError(err) + require.NoError(t, err) select { case res := <-waitCh: - require.True(res.Successful(), "task should have exited successfully: %v", res) + require.True(t, res.Successful(), "task should have exited successfully: %v", res) case <-time.After(time.Duration(testutil.TestMultiplier()*5) * time.Second): - require.Fail("timeout waiting for task") + 
require.Fail(t, "timeout waiting for task") } // Check that data was written to the shared alloc directory. outputFile := filepath.Join(task.TaskDir().SharedAllocDir, file) act, err := ioutil.ReadFile(outputFile) - require.NoError(err) - require.Exactly(exp, act) + require.NoError(t, err) + require.Exactly(t, exp, act) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_User(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -496,11 +523,13 @@ func TestExecDriver_User(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", User: "alice", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -509,11 +538,11 @@ func TestExecDriver_User(t *testing.T) { Command: "/bin/sleep", Args: []string{"100"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.Error(err) - require.Nil(handle) + require.Error(t, err) + require.Nil(t, handle) msg := "user alice" if !strings.Contains(err.Error(), msg) { @@ -525,18 +554,20 @@ func TestExecDriver_User(t *testing.T) { // executes commands inside the container. 
func TestExecDriver_HandlerExec(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) + ctestutils.CgroupsCompatibleV1(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -545,11 +576,11 @@ func TestExecDriver_HandlerExec(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - require.NoError(err) - require.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) // Exec a command that should work and dump the environment // TODO: enable section when exec env is fully loaded @@ -575,8 +606,8 @@ func TestExecDriver_HandlerExec(t *testing.T) { // Assert cgroup membership res, err := harness.ExecTask(task.ID, []string{"/bin/cat", "/proc/self/cgroup"}, time.Second) - require.NoError(err) - require.True(res.ExitResult.Successful()) + require.NoError(t, err) + require.True(t, res.ExitResult.Successful()) found := false for _, line := range strings.Split(string(res.Stdout), "\n") { // Every cgroup entry should be /nomad/$ALLOC_ID @@ -594,41 +625,41 @@ func TestExecDriver_HandlerExec(t *testing.T) { } found = true } - require.True(found, "exec'd command isn't in the task's cgroup") + require.True(t, found, "exec'd command isn't in the task's cgroup") // Exec a command that should fail res, err = harness.ExecTask(task.ID, []string{"/usr/bin/stat", "lkjhdsaflkjshowaisxmcvnlia"}, time.Second) - require.NoError(err) - require.False(res.ExitResult.Successful()) + require.NoError(t, err) + require.False(t, 
res.ExitResult.Successful()) if expected := "No such file or directory"; !bytes.Contains(res.Stdout, []byte(expected)) { t.Fatalf("expected output to contain %q but found: %q", expected, res.Stdout) } - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestExecDriver_DevicesAndMounts(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) tmpDir, err := ioutil.TempDir("", "exec_binds_mounts") - require.NoError(err) + require.NoError(t, err) defer os.RemoveAll(tmpDir) err = ioutil.WriteFile(filepath.Join(tmpDir, "testfile"), []byte("from-host"), 600) - require.NoError(err) + require.NoError(t, err) ctx, cancel := context.WithCancel(context.Background()) defer cancel() d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ ID: uuid.Generate(), Name: "test", User: "root", // need permission to read mounts paths - Resources: testResources, + Resources: testResources(allocID, "test"), StdoutPath: filepath.Join(tmpDir, "task-stdout"), StderrPath: filepath.Join(tmpDir, "task-stderr"), Devices: []*drivers.DeviceConfig{ @@ -652,8 +683,8 @@ func TestExecDriver_DevicesAndMounts(t *testing.T) { }, } - require.NoError(ioutil.WriteFile(task.StdoutPath, []byte{}, 660)) - require.NoError(ioutil.WriteFile(task.StderrPath, []byte{}, 660)) + require.NoError(t, ioutil.WriteFile(task.StdoutPath, []byte{}, 660)) + require.NoError(t, ioutil.WriteFile(task.StderrPath, []byte{}, 660)) tc := &TaskConfig{ Command: "/bin/bash", @@ -669,38 +700,38 @@ touch /tmp/task-path-ro/testfile-from-ro && echo from-exec > /tmp/task-path-ro/ exit 0 `}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, 
err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) stdout, err := ioutil.ReadFile(task.StdoutPath) - require.NoError(err) - require.Equal(`mounted device /inserted-random: 1:8 + require.NoError(t, err) + require.Equal(t, `mounted device /inserted-random: 1:8 reading from ro path: from-host reading from rw path: from-host overwriting file in rw succeeded writing new file in rw succeeded`, strings.TrimSpace(string(stdout))) stderr, err := ioutil.ReadFile(task.StderrPath) - require.NoError(err) - require.Equal(`touch: cannot touch '/tmp/task-path-ro/testfile': Read-only file system + require.NoError(t, err) + require.Equal(t, `touch: cannot touch '/tmp/task-path-ro/testfile': Read-only file system touch: cannot touch '/tmp/task-path-ro/testfile-from-ro': Read-only file system`, strings.TrimSpace(string(stderr))) // testing exit code last so we can inspect output first - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) fromRWContent, err := ioutil.ReadFile(filepath.Join(tmpDir, "testfile-from-rw")) - require.NoError(err) - require.Equal("from-exec", strings.TrimSpace(string(fromRWContent))) + require.NoError(t, err) + require.Equal(t, "from-exec", strings.TrimSpace(string(fromRWContent))) } func TestConfig_ParseAllHCL(t *testing.T) { @@ -719,13 +750,11 @@ config { var tc *TaskConfig hclutils.NewConfigParser(taskConfigSpec).ParseHCL(t, cfgStr, &tc) - require.EqualValues(t, expected, tc) } func TestExecDriver_NoPivotRoot(t *testing.T) { ci.Parallel(t) - r := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -741,14 +770,16 @@ func TestExecDriver_NoPivotRoot(t *testing.T) { } var data []byte - r.NoError(basePlug.MsgPackEncode(&data, config)) + require.NoError(t, basePlug.MsgPackEncode(&data, config)) bconfig := 
&basePlug.Config{PluginConfig: data} - r.NoError(harness.SetConfig(bconfig)) + require.NoError(t, harness.SetConfig(bconfig)) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "sleep", - Resources: testResources, + Resources: testResources(allocID, "sleep"), } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -757,11 +788,11 @@ func TestExecDriver_NoPivotRoot(t *testing.T) { Command: "/bin/sleep", Args: []string{"100"}, } - r.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) handle, _, err := harness.StartTask(task) - r.NoError(err) - r.NotNil(handle) + require.NoError(t, err) + require.NotNil(t, handle) } func TestDriver_Config_validate(t *testing.T) { diff --git a/drivers/exec/driver_unix_test.go b/drivers/exec/driver_unix_test.go index 6d62902f8a0c..ce765835dc3c 100644 --- a/drivers/exec/driver_unix_test.go +++ b/drivers/exec/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build darwin dragonfly freebsd linux netbsd openbsd solaris package exec @@ -11,6 +10,7 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" ctestutils "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/capabilities" "github.com/hashicorp/nomad/drivers/shared/executor" @@ -26,7 +26,6 @@ import ( func TestExecDriver_StartWaitStop(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctestutils.ExecCompatible(t) ctx, cancel := context.WithCancel(context.Background()) @@ -34,29 +33,31 @@ func TestExecDriver_StartWaitStop(t *testing.T) { d := NewExecDriver(ctx, testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) + allocID := uuid.Generate() task := &drivers.TaskConfig{ + AllocID: allocID, ID: uuid.Generate(), Name: "test", - Resources: testResources, + Resources: testResources(allocID, "test"), } 
taskConfig := map[string]interface{}{ "command": "/bin/sleep", "args": []string{"600"}, } - require.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) cleanup := harness.MkAllocDir(task, false) defer cleanup() handle, _, err := harness.StartTask(task) defer harness.DestroyTask(task.ID, true) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { harness.StopTask(task.ID, 2*time.Second, "SIGKILL") @@ -64,9 +65,9 @@ func TestExecDriver_StartWaitStop(t *testing.T) { select { case result := <-ch: - require.Equal(int(unix.SIGKILL), result.Signal) + require.Equal(t, int(unix.SIGKILL), result.Signal) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -82,13 +83,12 @@ func TestExecDriver_StartWaitStop(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) } func TestExec_ExecTaskStreaming(t *testing.T) { ci.Parallel(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -102,6 +102,12 @@ func TestExec_ExecTaskStreaming(t *testing.T) { Name: "sleep", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = testResources(allocID, "sleep") + } + cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -109,14 +115,13 @@ func TestExec_ExecTaskStreaming(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) _, _, err := 
harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.ExecTaskStreamingConformanceTests(t, harness, task.ID) - } // Tests that a given DNSConfig properly configures dns @@ -124,7 +129,6 @@ func TestExec_dnsConfig(t *testing.T) { ci.Parallel(t) ctestutils.RequireRoot(t) ctestutils.ExecCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -163,6 +167,11 @@ func TestExec_dnsConfig(t *testing.T) { DNS: c.cfg, } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.Resources = testResources(allocID, "sleep") + } + cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -170,10 +179,10 @@ func TestExec_dnsConfig(t *testing.T) { Command: "/bin/sleep", Args: []string{"9000"}, } - require.NoError(task.EncodeConcreteDriverConfig(&tc)) + require.NoError(t, task.EncodeConcreteDriverConfig(&tc)) _, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.TestTaskDNSConfig(t, harness, task.ID, c.cfg) @@ -189,6 +198,12 @@ func TestExecDriver_Capabilities(t *testing.T) { Name: "sleep", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = testResources(allocID, "sleep") + } + for _, tc := range []struct { Name string CapAdd []string diff --git a/drivers/java/driver_test.go b/drivers/java/driver_test.go index 63cb4c12043d..cc7a74378053 100644 --- a/drivers/java/driver_test.go +++ b/drivers/java/driver_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "github.com/hashicorp/nomad/client/lib/cgutil" "io" "io/ioutil" "os" @@ -12,14 +13,13 @@ import ( "time" "github.com/hashicorp/nomad/ci" - dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" - ctestutil "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/pluginutils/hclutils" "github.com/hashicorp/nomad/helper/testlog" 
"github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" + dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" ) @@ -60,7 +60,6 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -72,6 +71,7 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { Args: []string{"1"}, JvmOpts: []string{"-Xmx64m", "-Xms32m"}, } + task := basicTask(t, "demo-app", tc) cleanup := harness.MkAllocDir(task, true) @@ -80,28 +80,27 @@ func TestJavaDriver_Jar_Start_Wait(t *testing.T) { copyFile("./test-resources/demoapp.jar", filepath.Join(task.TaskDir().Dir, "demoapp.jar"), t) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Nil(result.Err) + require.Nil(t, result.Err) - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) // Get the stdout of the process and assert that it's not empty stdout, err := ioutil.ReadFile(filepath.Join(task.TaskDir().LogDir, "demo-app.stdout.0")) - require.NoError(err) - require.Contains(string(stdout), "Hello") + require.NoError(t, err) + require.Contains(t, string(stdout), "Hello") - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -121,12 +120,12 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { copyFile("./test-resources/demoapp.jar", filepath.Join(task.TaskDir().Dir, "demoapp.jar"), t) handle, _, err := harness.StartTask(task) - 
require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + require.NoError(t, harness.WaitUntilStarted(task.ID, 1*time.Second)) go func() { time.Sleep(10 * time.Millisecond) @@ -135,9 +134,9 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { select { case result := <-ch: - require.False(result.Successful()) + require.False(t, result.Successful()) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } // Ensure that the task is marked as dead, but account @@ -153,17 +152,16 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaDriver_Class_Start_Wait(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -182,21 +180,21 @@ func TestJavaDriver_Class_Start_Wait(t *testing.T) { copyFile("./test-resources/Hello.class", filepath.Join(task.TaskDir().Dir, "Hello.class"), t) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) result := <-ch - require.Nil(result.Err) + require.Nil(t, result.Err) - require.Zero(result.ExitCode) + require.Zero(t, result.ExitCode) // Get the stdout of the process and assert that it's not empty stdout, err := ioutil.ReadFile(filepath.Join(task.TaskDir().LogDir, "demo-app.stdout.0")) - require.NoError(err) - require.Contains(string(stdout), "Hello") + require.NoError(t, err) + require.Contains(t, string(stdout), 
"Hello") - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) } func TestJavaCmdArgs(t *testing.T) { @@ -254,7 +252,6 @@ func TestJavaDriver_ExecTaskStreaming(t *testing.T) { ci.Parallel(t) javaCompatible(t) - require := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -274,7 +271,7 @@ func TestJavaDriver_ExecTaskStreaming(t *testing.T) { copyFile("./test-resources/Hello.class", filepath.Join(task.TaskDir().Dir, "Hello.class"), t) _, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer d.DestroyTask(task.ID, true) dtestutil.ExecTaskStreamingConformanceTests(t, harness, task.ID) @@ -283,9 +280,11 @@ func TestJavaDriver_ExecTaskStreaming(t *testing.T) { func basicTask(t *testing.T, name string, taskConfig *TaskConfig) *drivers.TaskConfig { t.Helper() + allocID := uuid.Generate() task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: name, + AllocID: allocID, + ID: uuid.Generate(), + Name: name, Resources: &drivers.Resources{ NomadResources: &structs.AllocatedTaskResources{ Memory: structs.AllocatedMemoryResources{ @@ -302,6 +301,10 @@ func basicTask(t *testing.T, name string, taskConfig *TaskConfig) *drivers.TaskC }, } + if cgutil.UseV2 { + task.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, name)) + } + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) return task } diff --git a/drivers/rawexec/driver_test.go b/drivers/rawexec/driver_test.go index 4b76350980fe..bac01470c708 100644 --- a/drivers/rawexec/driver_test.go +++ b/drivers/rawexec/driver_test.go @@ -323,6 +323,8 @@ func TestRawExecDriver_Start_Wait_AllocDir(t *testing.T) { func TestRawExecDriver_Start_Kill_Wait_Cgroup(t *testing.T) { ci.Parallel(t) ctestutil.ExecCompatible(t) + ctestutil.CgroupsCompatibleV1(t) // todo(shoenig) #12348 + require := require.New(t) pidFile := "pid" @@ 
-414,6 +416,9 @@ func TestRawExecDriver_Start_Kill_Wait_Cgroup(t *testing.T) { func TestRawExecDriver_Exec(t *testing.T) { ci.Parallel(t) + ctestutil.ExecCompatible(t) + ctestutil.CgroupsCompatibleV1(t) // todo(shoenig) #12348 + require := require.New(t) d := newEnabledRawExecDriver(t) diff --git a/drivers/rawexec/driver_unix_test.go b/drivers/rawexec/driver_unix_test.go index bf7e0c4e2c35..895906bc9131 100644 --- a/drivers/rawexec/driver_unix_test.go +++ b/drivers/rawexec/driver_unix_test.go @@ -1,5 +1,4 @@ //go:build !windows -// +build !windows package rawexec @@ -18,8 +17,11 @@ import ( "time" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/lib/cgutil" + clienttestutil "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/testtask" "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" basePlug "github.com/hashicorp/nomad/plugins/base" "github.com/hashicorp/nomad/plugins/drivers" dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" @@ -30,9 +32,7 @@ import ( func TestRawExecDriver_User(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } + clienttestutil.RequireLinux(t) require := require.New(t) d := newEnabledRawExecDriver(t) @@ -205,13 +205,8 @@ func TestRawExecDriver_StartWaitStop(t *testing.T) { // task processes are cleaned up. 
func TestRawExecDriver_DestroyKillsAll(t *testing.T) { ci.Parallel(t) - - // This only works reliably with cgroup PID tracking, happens in linux only - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } - - require := require.New(t) + clienttestutil.RequireLinux(t) + clienttestutil.CgroupsCompatibleV1(t) // todo(shoenig): #12348 d := newEnabledRawExecDriver(t) harness := dtestutil.NewDriverHarness(t, d) @@ -222,6 +217,15 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { Name: "test", } + if cgutil.UseV2 { + allocID := uuid.Generate() + task.AllocID = allocID + task.Resources = new(drivers.Resources) + task.Resources.NomadResources = new(structs.AllocatedTaskResources) + task.Resources.LinuxResources = new(drivers.LinuxResources) + task.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, "test")) + } + cleanup := harness.MkAllocDir(task, true) defer cleanup() @@ -229,20 +233,20 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { taskConfig["command"] = "/bin/sh" taskConfig["args"] = []string{"-c", fmt.Sprintf(`sleep 3600 & echo "SLEEP_PID=$!"`)} - require.NoError(task.EncodeConcreteDriverConfig(&taskConfig)) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig)) handle, _, err := harness.StartTask(task) - require.NoError(err) + require.NoError(t, err) defer harness.DestroyTask(task.ID, true) ch, err := harness.WaitTask(context.Background(), handle.Config.ID) - require.NoError(err) + require.NoError(t, err) select { case result := <-ch: - require.True(result.Successful(), "command failed: %#v", result) + require.True(t, result.Successful(), "command failed: %#v", result) case <-time.After(10 * time.Second): - require.Fail("timeout waiting for task to shutdown") + require.Fail(t, "timeout waiting for task to shutdown") } sleepPid := 0 @@ -268,7 +272,7 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { sleepPid = pid return true, nil }, func(err error) { - 
require.NoError(err) + require.NoError(t, err) }) // isProcessRunning returns an error if process is not running @@ -286,9 +290,9 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { return nil } - require.NoError(isProcessRunning(sleepPid)) + require.NoError(t, isProcessRunning(sleepPid)) - require.NoError(harness.DestroyTask(task.ID, true)) + require.NoError(t, harness.DestroyTask(task.ID, true)) testutil.WaitForResult(func() (bool, error) { err := isProcessRunning(sleepPid) @@ -302,7 +306,7 @@ func TestRawExecDriver_DestroyKillsAll(t *testing.T) { return true, nil }, func(err error) { - require.NoError(err) + require.NoError(t, err) }) } @@ -342,9 +346,7 @@ func TestRawExec_ExecTaskStreaming(t *testing.T) { func TestRawExec_ExecTaskStreaming_User(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("skip, requires running on Linux for testing custom user setting") - } + clienttestutil.RequireLinux(t) d := newEnabledRawExecDriver(t) harness := dtestutil.NewDriverHarness(t, d) @@ -381,9 +383,7 @@ func TestRawExec_ExecTaskStreaming_User(t *testing.T) { func TestRawExecDriver_NoCgroup(t *testing.T) { ci.Parallel(t) - if runtime.GOOS != "linux" { - t.Skip("Linux only test") - } + clienttestutil.RequireLinux(t) expectedBytes, err := ioutil.ReadFile("/proc/self/cgroup") require.NoError(t, err) diff --git a/drivers/shared/executor/executor.go b/drivers/shared/executor/executor.go index 5026730b6971..99819d85a66d 100644 --- a/drivers/shared/executor/executor.go +++ b/drivers/shared/executor/executor.go @@ -302,7 +302,8 @@ func (e *UniversalExecutor) Launch(command *ExecCommand) (*ProcessState, error) // Setup cgroups on linux if e.commandCfg.ResourceLimits || e.commandCfg.BasicProcessCgroup { - if err := e.configureResourceContainer(os.Getpid()); err != nil { + pid := os.Getpid() + if err := e.configureResourceContainer(pid); err != nil { return nil, err } } diff --git a/drivers/shared/executor/executor_linux.go 
b/drivers/shared/executor/executor_linux.go index 989c4a062c4c..cdb14f64dba1 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -1,10 +1,10 @@ //go:build linux -// +build linux package executor import ( "context" + "errors" "fmt" "io" "os" @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/consul-template/signals" hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/stats" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -37,10 +38,6 @@ import ( "golang.org/x/sys/unix" ) -const ( - defaultCgroupParent = "/nomad" -) - var ( // ExecutorCgroupV1MeasuredMemStats is the list of memory stats captured by the executor with cgroup-v1 ExecutorCgroupV1MeasuredMemStats = []string{"RSS", "Cache", "Swap", "Usage", "Max Usage", "Kernel Usage", "Kernel Max Usage"} @@ -71,7 +68,6 @@ type LibcontainerExecutor struct { } func NewExecutorWithIsolation(logger hclog.Logger) Executor { - logger = logger.Named("isolated_executor") if err := shelpers.Init(); err != nil { logger.Error("unable to initialize stats", "error", err) @@ -665,14 +661,27 @@ func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error { } func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error { - // If resources are not limited then manually create cgroups needed if !command.ResourceLimits { - return configureBasicCgroups(cfg) + return cgutil.ConfigureBasicCgroups(cfg) } - id := uuid.Generate() - cfg.Cgroups.Path = filepath.Join("/", defaultCgroupParent, id) + // set cgroups path + if cgutil.UseV2 { + // in v2, the cgroup must have been created by the client already, + // which breaks a lot of existing tests that run drivers without a client + if command.Resources == nil || command.Resources.LinuxResources == nil || 
command.Resources.LinuxResources.CpusetCgroupPath == "" { + return errors.New("cgroup path must be set") + } + parent, cgroup := cgutil.SplitPath(command.Resources.LinuxResources.CpusetCgroupPath) + cfg.Cgroups.Path = filepath.Join("/", parent, cgroup) + } else { + // in v1, the cgroup is created using /nomad, which is a bug because it + // does not respect the cgroup_parent client configuration + // (but makes testing easy) + id := uuid.Generate() + cfg.Cgroups.Path = filepath.Join("/", cgutil.DefaultCgroupV1Parent, id) + } if command.Resources == nil || command.Resources.NomadResources == nil { return nil @@ -715,44 +724,6 @@ func configureCgroups(cfg *lconfigs.Config, command *ExecCommand) error { return nil } -func configureBasicCgroups(cfg *lconfigs.Config) error { - id := uuid.Generate() - - // Manually create freezer cgroup - - subsystem := "freezer" - - path, err := getCgroupPathHelper(subsystem, filepath.Join(defaultCgroupParent, id)) - if err != nil { - return fmt.Errorf("failed to find %s cgroup mountpoint: %v", subsystem, err) - } - - if err = os.MkdirAll(path, 0755); err != nil { - return err - } - - cfg.Cgroups.Paths = map[string]string{ - subsystem: path, - } - return nil -} - -func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) - if err != nil { - return "", err - } - - // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. 
- relCgroup, err := filepath.Rel(root, cgroup) - if err != nil { - return "", err - } - - return filepath.Join(mnt, relCgroup), nil -} - func newLibcontainerConfig(command *ExecCommand) (*lconfigs.Config, error) { cfg := &lconfigs.Config{ Cgroups: &lconfigs.Cgroup{ diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 67a991111471..d38072ad561c 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -14,6 +14,7 @@ import ( "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/drivers/shared/capabilities" @@ -82,6 +83,12 @@ func testExecutorCommandWithChroot(t *testing.T) *testExecCmd { }, } + if cgutil.UseV2 { + cmd.Resources.LinuxResources = &drivers.LinuxResources{ + CpusetCgroupPath: filepath.Join(cgutil.CgroupRoot, "testing.scope", cgutil.CgroupScope(alloc.ID, task.Name)), + } + } + testCmd := &testExecCmd{ command: cmd, allocDir: allocDir, @@ -164,8 +171,10 @@ func TestExecutor_Isolation_PID_and_IPC_hostMode(t *testing.T) { func TestExecutor_IsolationAndConstraints(t *testing.T) { ci.Parallel(t) - r := require.New(t) testutil.ExecCompatible(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig): hard codes cgroups v1 lookup + + r := require.New(t) testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir @@ -256,9 +265,10 @@ passwd` // hierarchy created for this process func TestExecutor_CgroupPaths(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + require := require.New(t) + testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir execCmd.Cmd = "/bin/bash" @@ -280,27 +290,33 @@ func TestExecutor_CgroupPaths(t 
*testing.T) { tu.WaitForResult(func() (bool, error) { output := strings.TrimSpace(testExecCmd.stdout.String()) - // Verify that we got some cgroups - if !strings.Contains(output, ":devices:") { - return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) - } - lines := strings.Split(output, "\n") - for _, line := range lines { - // Every cgroup entry should be /nomad/$ALLOC_ID - if line == "" { - continue + switch cgutil.UseV2 { + case true: + isScope := strings.HasSuffix(output, ".scope") + require.True(isScope) + case false: + // Verify that we got some cgroups + if !strings.Contains(output, ":devices:") { + return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) } + lines := strings.Split(output, "\n") + for _, line := range lines { + // Every cgroup entry should be /nomad/$ALLOC_ID + if line == "" { + continue + } - // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker - // don't isolate it by default. - // :: filters out odd empty cgroup found in latest Ubuntu lines, e.g. 0::/user.slice/user-1000.slice/session-17.scope - // that is also not used for isolation - if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { - continue - } + // Skip rdma & misc subsystem; rdma was added in most recent kernels and libcontainer/docker + // don't isolate it by default. + // :: filters out odd empty cgroup found in latest Ubuntu lines, e.g. 0::/user.slice/user-1000.slice/session-17.scope + // that is also not used for isolation + if strings.Contains(line, ":rdma:") || strings.Contains(line, ":misc:") || strings.Contains(line, "::") { + continue + } + if !strings.Contains(line, ":/nomad/") { + return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) + } - if !strings.Contains(line, ":/nomad/") { - return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... 
-- found=%q", line) } } return true, nil @@ -311,9 +327,10 @@ func TestExecutor_CgroupPaths(t *testing.T) { // are destroyed on shutdown func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + require := require.New(t) + testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir execCmd.Cmd = "/bin/bash" @@ -336,28 +353,34 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { var cgroupsPaths string tu.WaitForResult(func() (bool, error) { output := strings.TrimSpace(testExecCmd.stdout.String()) - // Verify that we got some cgroups - if !strings.Contains(output, ":devices:") { - return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) - } - lines := strings.Split(output, "\n") - for _, line := range lines { - // Every cgroup entry should be /nomad/$ALLOC_ID - if line == "" { - continue - } - // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker - // don't isolate it by default. - if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { - continue + switch cgutil.UseV2 { + case true: + isScope := strings.HasSuffix(output, ".scope") + require.True(isScope) + case false: + // Verify that we got some cgroups + if !strings.Contains(output, ":devices:") { + return false, fmt.Errorf("was expected cgroup files but found:\n%v", output) } + lines := strings.Split(output, "\n") + for _, line := range lines { + // Every cgroup entry should be /nomad/$ALLOC_ID + if line == "" { + continue + } - if !strings.Contains(line, ":/nomad/") { - return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) + // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker + // don't isolate it by default. 
+ if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { + continue + } + + if !strings.Contains(line, ":/nomad/") { + return false, fmt.Errorf("Not a member of the alloc's cgroup: expected=...:/nomad/... -- found=%q", line) + } } } - cgroupsPaths = output return true, nil }, func(err error) { t.Error(err) }) @@ -383,7 +406,7 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { continue } - p, err := getCgroupPathHelper(subsystem, cgroup) + p, err := cgutil.GetCgroupPathHelperV1(subsystem, cgroup) require.NoError(err) require.Falsef(cgroups.PathExists(p), "cgroup for %s %s still exists", subsystem, cgroup) } @@ -433,8 +456,10 @@ func TestUniversalExecutor_LookupTaskBin(t *testing.T) { // Exec Launch looks for the binary only inside the chroot func TestExecutor_EscapeContainer(t *testing.T) { ci.Parallel(t) - require := require.New(t) testutil.ExecCompatible(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig) kills the terminal, probably defaulting to / + + require := require.New(t) testExecCmd := testExecutorCommandWithChroot(t) execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir diff --git a/drivers/shared/executor/executor_test.go b/drivers/shared/executor/executor_test.go index 4d930843c1bf..57fbbb015d0e 100644 --- a/drivers/shared/executor/executor_test.go +++ b/drivers/shared/executor/executor_test.go @@ -18,7 +18,9 @@ import ( hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/taskenv" + "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -90,6 +92,10 @@ func testExecutorCommand(t *testing.T) *testExecCmd { }, } + if cgutil.UseV2 { + cmd.Resources.LinuxResources.CpusetCgroupPath = filepath.Join(cgutil.CgroupRoot, "testing.scope", cgutil.CgroupScope(alloc.ID, 
task.Name)) + } + testCmd := &testExecCmd{ command: cmd, allocDir: allocDir, @@ -253,6 +259,8 @@ func TestExecutor_Start_Wait_Children(t *testing.T) { func TestExecutor_WaitExitSignal(t *testing.T) { ci.Parallel(t) + testutil.CgroupsCompatibleV1(t) // todo(shoenig) #12351 + for name, factory := range executorFactories { t.Run(name, func(t *testing.T) { testExecCmd := testExecutorCommand(t) @@ -519,6 +527,7 @@ func copyFile(t *testing.T, src, dst string) { func TestExecutor_Start_Kill_Immediately_NoGrace(t *testing.T) { ci.Parallel(t) for name, factory := range executorFactories { + t.Run(name, func(t *testing.T) { require := require.New(t) testExecCmd := testExecutorCommand(t) diff --git a/drivers/shared/executor/executor_universal_linux.go b/drivers/shared/executor/executor_universal_linux.go index d5076a8d3d6a..7c97681df55c 100644 --- a/drivers/shared/executor/executor_universal_linux.go +++ b/drivers/shared/executor/executor_universal_linux.go @@ -10,6 +10,7 @@ import ( "github.com/containernetworking/plugins/pkg/ns" multierror "github.com/hashicorp/go-multierror" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/plugins/drivers" "github.com/opencontainers/runc/libcontainer/cgroups" cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" @@ -77,8 +78,7 @@ func (e *UniversalExecutor) configureResourceContainer(pid int) error { cfg.Cgroups.Resources.Devices = append(cfg.Cgroups.Resources.Devices, &device.Rule) } - err := configureBasicCgroups(cfg) - if err != nil { + if err := cgutil.ConfigureBasicCgroups(cfg); err != nil { // Log this error to help diagnose cases where nomad is run with too few // permissions, but don't return an error. There is no separate check for // cgroup creation permissions, so this may be the happy path. 
diff --git a/drivers/shared/executor/client.go b/drivers/shared/executor/grpc_client.go similarity index 100% rename from drivers/shared/executor/client.go rename to drivers/shared/executor/grpc_client.go diff --git a/drivers/shared/executor/server.go b/drivers/shared/executor/grpc_server.go similarity index 100% rename from drivers/shared/executor/server.go rename to drivers/shared/executor/grpc_server.go diff --git a/drivers/shared/executor/resource_container_linux.go b/drivers/shared/executor/resource_container_linux.go index 50ab7cef9e6e..dbdf83f16282 100644 --- a/drivers/shared/executor/resource_container_linux.go +++ b/drivers/shared/executor/resource_container_linux.go @@ -30,6 +30,7 @@ func (rc *resourceContainerContext) isEmpty() bool { return rc.groups == nil } +// todo(shoenig) cgroups.v2 #12351 func (rc *resourceContainerContext) getAllPidsByCgroup() (map[int]*nomadPid, error) { nPids := map[int]*nomadPid{} diff --git a/go.mod b/go.mod index 0218687a452d..11a3750bdcc3 100644 --- a/go.mod +++ b/go.mod @@ -97,7 +97,7 @@ require ( github.com/mitchellh/mapstructure v1.4.3 github.com/mitchellh/reflectwalk v1.0.2 github.com/moby/sys/mount v0.3.0 - github.com/moby/sys/mountinfo v0.5.0 + github.com/moby/sys/mountinfo v0.6.0 github.com/opencontainers/runc v1.0.3 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 github.com/pkg/errors v0.9.1 @@ -162,9 +162,9 @@ require ( github.com/boltdb/bolt v1.3.1 // indirect github.com/census-instrumentation/opencensus-proto v0.3.0 // indirect github.com/cespare/xxhash/v2 v2.1.2 // indirect - github.com/checkpoint-restore/go-criu/v5 v5.0.0 // indirect + github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect github.com/cheggaaa/pb/v3 v3.0.5 // indirect - github.com/cilium/ebpf v0.6.2 // indirect + github.com/cilium/ebpf v0.8.1 // indirect github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible // indirect github.com/circonus-labs/circonusllhist v0.1.3 // indirect github.com/cncf/udpa/go 
v0.0.0-20210930031921-04548b0d99d4 // indirect @@ -173,7 +173,7 @@ require ( github.com/containerd/console v1.0.3 // indirect github.com/containerd/containerd v1.5.9 // indirect github.com/coreos/go-systemd/v22 v22.3.2 // indirect - github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 // indirect + github.com/cyphar/filepath-securejoin v0.2.3 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba // indirect github.com/digitalocean/godo v1.10.0 // indirect @@ -186,7 +186,7 @@ require ( github.com/envoyproxy/protoc-gen-validate v0.6.2 // indirect github.com/felixge/httpsnoop v1.0.1 // indirect github.com/go-ole/go-ole v1.2.6 // indirect - github.com/godbus/dbus/v5 v5.0.4 // indirect + github.com/godbus/dbus/v5 v5.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/gojuno/minimock/v3 v3.0.6 // indirect github.com/golang-jwt/jwt/v4 v4.0.0 // indirect @@ -232,7 +232,7 @@ require ( github.com/oklog/run v1.0.1-0.20180308005104-6934b124db28 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.0.2 // indirect - github.com/opencontainers/selinux v1.8.2 // indirect + github.com/opencontainers/selinux v1.10.0 // indirect github.com/packethost/packngo v0.1.1-0.20180711074735-b9cb5096f54c // indirect github.com/pierrec/lz4 v2.5.2+incompatible // indirect github.com/pmezard/go-difflib v1.0.0 // indirect @@ -241,7 +241,7 @@ require ( github.com/prometheus/procfs v0.7.3 // indirect github.com/renier/xmlrpc v0.0.0-20170708154548-ce4a1a486c03 // indirect github.com/rogpeppe/go-internal v1.6.1 // indirect - github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128 // indirect + github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 // indirect github.com/sirupsen/logrus v1.8.1 // indirect github.com/softlayer/softlayer-go v0.0.0-20180806151055-260589d94c7d // indirect 
github.com/stretchr/objx v0.2.0 // indirect diff --git a/go.sum b/go.sum index 9a472d9f02ae..ff8ba072fda7 100644 --- a/go.sum +++ b/go.sum @@ -209,8 +209,9 @@ github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/checkpoint-restore/go-criu/v4 v4.1.0/go.mod h1:xUQBLp4RLc5zJtWY++yjOoMoB5lihDt7fai+75m+rGw= -github.com/checkpoint-restore/go-criu/v5 v5.0.0 h1:TW8f/UvntYoVDMN1K2HlT82qH1rb0sOjpGw3m6Ym+i4= github.com/checkpoint-restore/go-criu/v5 v5.0.0/go.mod h1:cfwC0EG7HMUenopBsUf9d89JlCLQIfgVcNsNN0t6T2M= +github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8= +github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E= github.com/cheggaaa/pb v1.0.27 h1:wIkZHkNfC7R6GI5w7l/PdAdzXzlrbcI3p8OAlnkTsnc= github.com/cheggaaa/pb v1.0.27/go.mod h1:pQciLPpbU0oxA0h+VJYYLxO+XeDQb5pZijXscXHm81s= github.com/cheggaaa/pb/v3 v3.0.5 h1:lmZOti7CraK9RSjzExsY53+WWfub9Qv13B5m4ptEoPE= @@ -222,8 +223,9 @@ github.com/cilium/ebpf v0.0.0-20200110133405-4032b1d8aae3/go.mod h1:MA5e5Lr8slmE github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775/go.mod h1:7cR51M8ViRLIdUjrmSXlK9pkrsDlLHbO8jiB8X8JnOc= github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= -github.com/cilium/ebpf v0.6.2 h1:iHsfF/t4aW4heW2YKfeHrVPGdtYTL4C4KocpM8KTSnI= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= +github.com/cilium/ebpf v0.8.1 h1:bLSSEbBLqGPXxls55pGr5qWZaTqcmfDJHhou7t254ao= +github.com/cilium/ebpf v0.8.1/go.mod h1:f5zLIM0FSNuAkSyLAN7X+Hy6yznlF1mNiWUMfxMtrgk= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible h1:C29Ae4G5GtYyYMm1aztcyj/J5ckgJm2zwdDajFbx1NY= 
github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3 h1:TJH+oke8D16535+jHExHj4nQvzlZrj7ug5D7I/orNUA= @@ -372,8 +374,8 @@ github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ github.com/creack/pty v1.1.17 h1:QeVUsEDNrLBW4tMgZHvxy18sKtr6VI492kBhUfhDJNI= github.com/creack/pty v1.1.17/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= -github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1 h1:dCqRswe3ZAwkQWdvFLwRqmJCpGP3DWb7bFogdqY3+QU= -github.com/cyphar/filepath-securejoin v0.2.3-0.20190205144030-7efe413b52e1/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= +github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI= +github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/d2g/dhcp4 v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ= github.com/d2g/dhcp4client v1.0.0/go.mod h1:j0hNfjhrt2SxUOw55nL0ATM/z4Yt3t2Kd1mW34z5W5s= github.com/d2g/dhcp4server v0.0.0-20181031114812-7d4a0a7f59a5/go.mod h1:Eo87+Kg/IX2hfWJfwxMzLyuSZyxSoAug2nGa1G2QAi8= @@ -462,8 +464,9 @@ github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSw github.com/form3tech-oss/jwt-go v3.2.2+incompatible/go.mod h1:pbq4aXjuKjdthFRnoDwaVPLA+WlJuPGy+QneDUgJi2k= github.com/frankban/quicktest v1.4.0/go.mod h1:36zfPVQyHxymz4cH7wlDmVwDrJuljRB60qkgn7rorfQ= github.com/frankban/quicktest v1.10.0/go.mod h1:ui7WezCLWMWxVWr1GETZY3smRy0G4KWq9vcPtJmFl7Y= -github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/frankban/quicktest v1.14.0 
h1:+cqqvzZV87b4adx/5ayVOaYZ2CrvM4ejQvUdBzPPUss= +github.com/frankban/quicktest v1.14.0/go.mod h1:NeW+ay9A/U67EYXNFA1nPE8e/tnQv/09mUdL/ijj8og= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= @@ -515,8 +518,9 @@ github.com/godbus/dbus v0.0.0-20180201030542-885f9cc04c9c/go.mod h1:/YcGZj5zSblf github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e h1:BWhy2j3IXJhjCbC68FptL43tDKIq8FladmaTs3Xs7Z8= github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= +github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/googleapis v1.2.0/go.mod h1:Njal3psf3qN6dwBtQfUmBZh2ybovJ0tlu3o/AC7HYjU= github.com/gogo/googleapis v1.4.0/go.mod h1:5YRNX2z1oM5gXdAkurHa942MDgEJyk02w4OecKY87+c= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= @@ -985,8 +989,9 @@ github.com/moby/sys/mount v0.3.0 h1:bXZYMmq7DBQPwHRxH/MG+u9+XF90ZOwoXpHTOznMGp0= github.com/moby/sys/mount v0.3.0/go.mod h1:U2Z3ur2rXPFrFmy4q6WMwWrBOAQGYtYTRVM8BIvzbwk= github.com/moby/sys/mountinfo v0.4.0/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= github.com/moby/sys/mountinfo v0.4.1/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= -github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI= github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= 
+github.com/moby/sys/mountinfo v0.6.0 h1:gUDhXQx58YNrpHlK4nSL+7y2pxFZkUcXqzFDKWdC0Oo= +github.com/moby/sys/mountinfo v0.6.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= github.com/moby/sys/symlink v0.1.0/go.mod h1:GGDODQmbFOjFsXvfLVn3+ZRxkch54RkSiGqsZeMYowQ= github.com/moby/term v0.0.0-20200312100748-672ec06f55cd/go.mod h1:DdlQx2hp0Ss5/fLikoLlEeIYiATotOjgB//nb973jeo= github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 h1:dcztxKSvZ4Id8iPpHERQBbIJfabdt4wUm5qy3wOL2Zc= @@ -1064,8 +1069,9 @@ github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.m github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39/go.mod h1:r3f7wjNzSs2extwzU3Y+6pKfobzPh+kKFJ3ofN+3nfs= github.com/opencontainers/selinux v1.6.0/go.mod h1:VVGKuOLlE7v4PJyT6h7mNWvq1rzqiriPsEqVhc+svHE= github.com/opencontainers/selinux v1.8.0/go.mod h1:RScLhm78qiWa2gbVCcGkC7tCGdgk3ogry1nUQF8Evvo= -github.com/opencontainers/selinux v1.8.2 h1:c4ca10UMgRcvZ6h0K4HtS15UaVSBEaE+iln2LVpAuGc= github.com/opencontainers/selinux v1.8.2/go.mod h1:MUIHuUEvKB1wtJjQdOyYRgOnLD2xAPP8dBsCoU0KuF8= +github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= +github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/packethost/packngo v0.1.1-0.20180711074735-b9cb5096f54c h1:vwpFWvAO8DeIZfFeqASzZfsxuWPno9ncAebBEP0N3uE= @@ -1160,8 +1166,8 @@ github.com/sclevine/agouti v3.0.0+incompatible/go.mod h1:b4WX9W9L1sfQKXeJf1mUTLZ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod 
h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= -github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128 h1:vWI7jlfJVWN//T8jrt5JduYkSid+Sl/fRz33J1jQ83k= -github.com/seccomp/libseccomp-golang v0.9.2-0.20200314001724-bdab42bd5128/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y= +github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/sergi/go-diff v1.0.0 h1:Kpca3qRNrduNnOQeazBd0ysaKrUJiIuISHxogkT9RPQ= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/shirou/gopsutil v0.0.0-20181107111621-48177ef5f880 h1:1Ge4j/3uB2rxzPWD3TC+daeCw+w91z8UCUL/7WH5gn8= @@ -1570,6 +1576,7 @@ golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210816074244-15123e1e1f71/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210816183151-1e6c022a8912/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210917161153-d61c044b1678/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211013075003-97ac67df715c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/lib/cpuset/cpuset.go b/lib/cpuset/cpuset.go index 0caa966710d3..9cea06cad877 100644 --- a/lib/cpuset/cpuset.go +++ b/lib/cpuset/cpuset.go @@ -28,6 +28,17 @@ func New(cpus ...uint16) CPUSet { return cpuset } +// Copy returns a deep copy of CPUSet c. 
+func (c CPUSet) Copy() CPUSet { + cpus := make(map[uint16]struct{}, len(c.cpus)) + for k := range c.cpus { + cpus[k] = struct{}{} + } + return CPUSet{ + cpus: cpus, + } +} + // String returns the cpuset as a comma delimited set of core values and ranged func (c CPUSet) String() string { if c.Size() == 0 { diff --git a/lib/cpuset/cpuset_test.go b/lib/cpuset/cpuset_test.go index 178517680b12..e417744b03d8 100644 --- a/lib/cpuset/cpuset_test.go +++ b/lib/cpuset/cpuset_test.go @@ -207,7 +207,7 @@ func TestParse(t *testing.T) { func TestCPUSet_String(t *testing.T) { ci.Parallel(t) - + cases := []struct { cpuset CPUSet expected string @@ -222,3 +222,16 @@ func TestCPUSet_String(t *testing.T) { require.Equal(t, c.expected, c.cpuset.String()) } } + +func TestCPUSet_Copy(t *testing.T) { + ci.Parallel(t) + + original := New(1, 2, 3, 4, 5) + copied := original.Copy() + require.True(t, original.Equals(copied)) + + delete(copied.cpus, 3) + require.False(t, original.Equals(copied)) + require.True(t, original.Equals(New(1, 2, 3, 4, 5))) + require.True(t, copied.Equals(New(1, 2, 4, 5))) +} diff --git a/plugins/drivers/driver.go b/plugins/drivers/driver.go index f47dc5a2815b..4b8cd7d13531 100644 --- a/plugins/drivers/driver.go +++ b/plugins/drivers/driver.go @@ -267,7 +267,7 @@ type TaskConfig struct { JobName string JobID string TaskGroupName string - Name string + Name string // task.Name Namespace string NodeName string NodeID string diff --git a/plugins/drivers/testutils/exec_testing.go b/plugins/drivers/testutils/exec_testing.go index 8d824a16d9dc..a1cd361b11d9 100644 --- a/plugins/drivers/testutils/exec_testing.go +++ b/plugins/drivers/testutils/exec_testing.go @@ -14,8 +14,10 @@ import ( "testing" "time" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/plugins/drivers" dproto "github.com/hashicorp/nomad/plugins/drivers/proto" + "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" ) @@ -192,9 +194,35 @@ func 
TestExecFSIsolation(t *testing.T, driver *DriverHarness, taskID string) { false, "") require.Zero(t, r.exitCode) - if !strings.Contains(r.stdout, ":freezer:/nomad") && !strings.Contains(r.stdout, "freezer:/docker") { - require.Fail(t, "unexpected freezer cgroup", "expected freezer to be /nomad/ or /docker/, but found:\n%s", r.stdout) + if !cgutil.UseV2 { + acceptable := []string{ + ":freezer:/nomad", ":freezer:/docker", + } + if testutil.IsCI() { + // github actions freezer cgroup + acceptable = append(acceptable, ":freezer:/actions_job") + } + ok := false + for _, freezerGroup := range acceptable { + if strings.Contains(r.stdout, freezerGroup) { + ok = true + break + } + } + if !ok { + require.Fail(t, "unexpected freezer cgroup", "expected freezer to be /nomad/ or /docker/, but found:\n%s", r.stdout) + } + } else { + info, _ := driver.PluginInfo() + if info.Name == "docker" { + // Note: docker on cgroups v2 now returns nothing + // root@97b4d3d33035:/# cat /proc/self/cgroup + // 0::/ + t.Skip("/proc/self/cgroup not useful in docker cgroups.v2") + } + // e.g. 
0::/testing.slice/5bdbd6c2-8aba-3ab2-728b-0ff3a81727a9.sleep.scope + require.True(t, strings.HasSuffix(strings.TrimSpace(r.stdout), ".scope"), "actual stdout %q", r.stdout) } }) } diff --git a/plugins/drivers/testutils/testing.go b/plugins/drivers/testutils/testing.go index 5056344e0824..f0f6c6713c2c 100644 --- a/plugins/drivers/testutils/testing.go +++ b/plugins/drivers/testutils/testing.go @@ -4,17 +4,17 @@ import ( "context" "fmt" "io/ioutil" + "os" "path/filepath" "runtime" "strings" "time" - testing "github.com/mitchellh/go-testing-interface" - hclog "github.com/hashicorp/go-hclog" plugin "github.com/hashicorp/go-plugin" "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/logmon" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/helper/testlog" @@ -24,6 +24,7 @@ import ( "github.com/hashicorp/nomad/plugins/base" "github.com/hashicorp/nomad/plugins/drivers" "github.com/hashicorp/nomad/plugins/shared/hclspec" + testing "github.com/mitchellh/go-testing-interface" "github.com/stretchr/testify/require" ) @@ -34,6 +35,7 @@ type DriverHarness struct { t testing.T logger hclog.Logger impl drivers.DriverPlugin + cgroup string } func (h *DriverHarness) Impl() drivers.DriverPlugin { @@ -53,12 +55,10 @@ func NewDriverHarness(t testing.T, d drivers.DriverPlugin) *DriverHarness { ) raw, err := client.Dispense(base.PluginTypeDriver) - if err != nil { - t.Fatalf("err dispensing plugin: %v", err) - } + require.NoError(t, err, "failed to dispense plugin") dClient := raw.(drivers.DriverPlugin) - h := &DriverHarness{ + return &DriverHarness{ client: client, server: server, DriverPlugin: dClient, @@ -66,13 +66,36 @@ func NewDriverHarness(t testing.T, d drivers.DriverPlugin) *DriverHarness { t: t, impl: d, } +} - return h +// setCgroup creates a v2 cgroup for the task, as if a Client were initialized +// and managing the cgroup as it normally 
would. +// +// Uses testing.slice as a parent. +func (h *DriverHarness) setCgroup(allocID, task string) { + if cgutil.UseV2 { + h.cgroup = filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(allocID, task)) + f, err := os.Create(h.cgroup) + if err != nil { + panic(err) + } + defer f.Close() + } } func (h *DriverHarness) Kill() { - h.client.Close() + _ = h.client.Close() h.server.Stop() + h.cleanupCgroup() +} + +func (h *DriverHarness) cleanupCgroup() { + if cgutil.UseV2 { + err := os.Remove(h.cgroup) + if err != nil { + panic(err) + } + } } // tinyChroot is useful for testing, where we do not use anything other than @@ -139,6 +162,7 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( // Create the mock allocation alloc := mock.Alloc() + alloc.ID = t.AllocID if t.Resources != nil { alloc.AllocatedResources.Tasks[task.Name] = t.Resources.NomadResources } @@ -157,6 +181,9 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( } } + // set cgroup + h.setCgroup(alloc.ID, task.Name) + //logmon if enableLogs { lm := logmon.NewLogMon(h.logger.Named("logmon")) @@ -189,6 +216,7 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func( return func() { h.client.Close() allocDir.Destroy() + h.cleanupCgroup() } } diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index 56d2d7c44f56..874d2158e473 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -94,6 +94,29 @@ be removed in future releases. The previous `Protocol` value can be viewed using the `-verbose` flag. +#### Linux Control Groups Version 2 + +Starting with Nomad 1.3.0, Linux systems configured to use [cgroups v2][cgroups2] are +now supported. 
A Nomad client will only activate its v2 control groups manager if the +system is configured with the cgroups2 controller mounted at `/sys/fs/cgroup`. This implies +Nomad will continue to fallback to the v1 control groups manager on systems +configured to run in hybrid mode, where the cgroups2 controller is typically mounted +at `/sys/fs/cgroup/unified`. Systems that do not support cgroups v2 are not affected. A +new client attribute `unique.cgroup.version` indicates which version of control groups +Nomad is using. + +When cgroups v2 are in use, Nomad uses `nomad.slice` as the [default parent][cgroup_parent] for cgroups +created on behalf of tasks. The cgroup created for a task is named in the form `-.scope`. +These cgroups are created by Nomad before a task starts. External task drivers that support +containerization should be updated to make use of the new cgroup locations. + +``` +➜ tree -d /sys/fs/cgroup/nomad.slice +/sys/fs/cgroup/nomad.slice +├── 8b8da4cf-8ebf-b578-0bcf-77190749abf3.redis.scope +└── a8c8e495-83c8-311b-4657-e6e3127e98bc.example.scope +``` + ## Nomad 1.2.6, 1.1.12, and 1.0.18 #### ACL requirement for the job parse endpoint @@ -1262,6 +1285,8 @@ state. Once that is done the client can be killed, the `data_dir` should be deleted and then Nomad 0.3.0 can be launched. 
[api_jobs_parse]: /api-docs/jobs#parse-job +[cgroups2]: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html +[cgroup_parent]: /docs/configuration/client#cgroup_parent [dangling-containers]: /docs/drivers/docker#dangling-containers [drain-api]: /api-docs/nodes#drain-node [drain-cli]: /docs/commands/node/drain From 8cc812197e3c742871aa779163b6d041f5de9b27 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 23 Mar 2022 19:12:51 -0400 Subject: [PATCH 06/16] build(deps): bump github.com/stretchr/testify from 1.7.0 to 1.7.1 (#12306) --- api/go.mod | 2 +- api/go.sum | 4 ++-- go.mod | 2 +- go.sum | 3 ++- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/api/go.mod b/api/go.mod index 37c7373d1b5d..9619f2ae2faa 100644 --- a/api/go.mod +++ b/api/go.mod @@ -11,7 +11,7 @@ require ( github.com/kr/pretty v0.3.0 github.com/mitchellh/go-testing-interface v1.14.1 github.com/mitchellh/mapstructure v1.4.3 - github.com/stretchr/testify v1.7.0 + github.com/stretchr/testify v1.7.1 ) require ( diff --git a/api/go.sum b/api/go.sum index b3c72ca5214f..a2ab1719ebf6 100644 --- a/api/go.sum +++ b/api/go.sum @@ -30,8 +30,8 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= diff --git a/go.mod b/go.mod index 0218687a452d..dbe6485c0fce 100644 --- a/go.mod +++ b/go.mod @@ -110,7 +110,7 @@ require ( github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 github.com/shirou/gopsutil/v3 v3.21.12 github.com/skratchdot/open-golang v0.0.0-20160302144031-75fb7ed4208c - github.com/stretchr/testify v1.7.0 + github.com/stretchr/testify v1.7.1 github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 github.com/zclconf/go-cty v1.8.0 github.com/zclconf/go-cty-yaml v1.0.2 diff --git a/go.sum b/go.sum index 9a472d9f02ae..cf61bb5ea80b 100644 --- a/go.sum +++ b/go.sum @@ -1215,8 +1215,9 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI= From 90160fc40c4358005bde59c6300e1790c3b69b08 Mon Sep 17 00:00:00 
2001 From: Seth Hoenig Date: Thu, 24 Mar 2022 08:23:57 -0500 Subject: [PATCH 07/16] core: write peers.json file with correct permissions --- .changelog/12369.txt | 3 +++ nomad/server.go | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .changelog/12369.txt diff --git a/.changelog/12369.txt b/.changelog/12369.txt new file mode 100644 index 000000000000..9bd255b12dc9 --- /dev/null +++ b/.changelog/12369.txt @@ -0,0 +1,3 @@ +```release-note:bug +Write peers.json file with correct permissions +``` diff --git a/nomad/server.go b/nomad/server.go index 5e3d2eb51ac2..f30d903c81f4 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1330,7 +1330,7 @@ func (s *Server) setupRaft() error { peersFile := filepath.Join(path, "peers.json") peersInfoFile := filepath.Join(path, "peers.info") if _, err := os.Stat(peersInfoFile); os.IsNotExist(err) { - if err := ioutil.WriteFile(peersInfoFile, []byte(peersInfoContent), 0755); err != nil { + if err := ioutil.WriteFile(peersInfoFile, []byte(peersInfoContent), 0644); err != nil { return fmt.Errorf("failed to write peers.info file: %v", err) } From a7e5df8381386662548d57e7f26efe999f72c7b3 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 24 Mar 2022 10:29:50 -0400 Subject: [PATCH 08/16] csi: add `-secret` and `-parameter` flag to `volume snapshot create` (#12360) Pass-through the `-secret` and `-parameter` flags to allow setting parameters for the snapshot and overriding the secrets we've stored on the CSI volume in the state store. 
--- .changelog/12360.txt | 3 ++ api/csi.go | 4 ++ command/volume_snapshot_create.go | 39 +++++++++++++++++++ command/volume_snapshot_delete.go | 5 ++- command/volume_snapshot_list.go | 17 +++++--- nomad/csi_endpoint.go | 8 +++- .../docs/commands/volume/snapshot-create.mdx | 11 ++++++ .../docs/commands/volume/snapshot-list.mdx | 5 ++- 8 files changed, 83 insertions(+), 9 deletions(-) create mode 100644 .changelog/12360.txt diff --git a/.changelog/12360.txt b/.changelog/12360.txt new file mode 100644 index 000000000000..8422c38cd606 --- /dev/null +++ b/.changelog/12360.txt @@ -0,0 +1,3 @@ +```release-note:improvement +csi: Added `-secret` and `-parameter` flags to `volume snapshot create` command +``` diff --git a/api/csi.go b/api/csi.go index 25cddc50ba80..829e3a2517bb 100644 --- a/api/csi.go +++ b/api/csi.go @@ -120,6 +120,10 @@ func (v *CSIVolumes) CreateSnapshot(snap *CSISnapshot, w *WriteOptions) (*CSISna req := &CSISnapshotCreateRequest{ Snapshots: []*CSISnapshot{snap}, } + if w == nil { + w = &WriteOptions{} + } + w.SetHeadersFromCSISecrets(snap.Secrets) resp := &CSISnapshotCreateResponse{} meta, err := v.client.write("/v1/volumes/snapshot", req, resp, w) return resp, meta, err diff --git a/command/volume_snapshot_create.go b/command/volume_snapshot_create.go index 4fcfdcd73455..013b285c0e42 100644 --- a/command/volume_snapshot_create.go +++ b/command/volume_snapshot_create.go @@ -6,6 +6,7 @@ import ( "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" + flaghelper "github.com/hashicorp/nomad/helper/flags" "github.com/posener/complete" ) @@ -34,7 +35,20 @@ General Options: ` + generalOptionsUsage(usageOptsDefault) + ` +Snapshot Create Options: + + -parameter + Parameters to pass to the plugin to create a snapshot. Accepts multiple + flags in the form -parameter key=value + + -secret + Secrets to pass to the plugin to create snapshot. 
Accepts multiple + flags in the form -secret key=value + + -verbose + Display full information for the resulting snapshot. ` + return strings.TrimSpace(helpText) } @@ -70,7 +84,11 @@ func (c *VolumeSnapshotCreateCommand) Run(args []string) int { flags.Usage = func() { c.Ui.Output(c.Help()) } var verbose bool + var parametersArgs flaghelper.StringFlag + var secretsArgs flaghelper.StringFlag flags.BoolVar(&verbose, "verbose", false, "") + flags.Var(¶metersArgs, "parameter", "parameters for snapshot, ex. -parameter key=value") + flags.Var(&secretsArgs, "secret", "secrets for snapshot, ex. -secret key=value") if err := flags.Parse(args); err != nil { c.Ui.Error(fmt.Sprintf("Error parsing arguments %s", err)) @@ -97,9 +115,30 @@ func (c *VolumeSnapshotCreateCommand) Run(args []string) int { return 1 } + secrets := api.CSISecrets{} + for _, kv := range secretsArgs { + s := strings.Split(kv, "=") + if len(s) == 2 { + secrets[s[0]] = s[1] + } else { + c.Ui.Error("Secret must be in the format: -secret key=value") + return 1 + } + } + + params := map[string]string{} + for _, kv := range parametersArgs { + p := strings.Split(kv, "=") + if len(p) == 2 { + params[p[0]] = p[1] + } + } + snaps, _, err := client.CSIVolumes().CreateSnapshot(&api.CSISnapshot{ SourceVolumeID: volID, Name: snapshotName, + Secrets: secrets, + Parameters: params, }, nil) if err != nil { c.Ui.Error(fmt.Sprintf("Error snapshotting volume: %s", err)) diff --git a/command/volume_snapshot_delete.go b/command/volume_snapshot_delete.go index a2ac23c7b3cd..d6eaccc1cd80 100644 --- a/command/volume_snapshot_delete.go +++ b/command/volume_snapshot_delete.go @@ -77,7 +77,7 @@ func (c *VolumeSnapshotDeleteCommand) Run(args []string) int { } // Check that we get exactly two arguments args = flags.Args() - if l := len(args); l != 2 { + if l := len(args); l < 2 { c.Ui.Error("This command takes two arguments: ") c.Ui.Error(commandErrorText(c)) return 1 @@ -97,6 +97,9 @@ func (c *VolumeSnapshotDeleteCommand) Run(args 
[]string) int { s := strings.Split(kv, "=") if len(s) == 2 { secrets[s[0]] = s[1] + } else { + c.Ui.Error("Secret must be in the format: -secret key=value") + return 1 } } diff --git a/command/volume_snapshot_list.go b/command/volume_snapshot_list.go index a27e71bfaaf8..df0f925e90be 100644 --- a/command/volume_snapshot_list.go +++ b/command/volume_snapshot_list.go @@ -36,6 +36,12 @@ General Options: List Options: + -page-token + Where to start pagination. + + -per-page + How many results to show per page. Defaults to 30. + -plugin: Display only snapshots managed by a particular plugin. This parameter is required. @@ -43,12 +49,10 @@ List Options: Secrets to pass to the plugin to list snapshots. Accepts multiple flags in the form -secret key=value - -per-page - How many results to show per page. Defaults to 30. - - -page-token - Where to start pagination. + -verbose + Display full information for snapshots. ` + return strings.TrimSpace(helpText) } @@ -139,6 +143,9 @@ func (c *VolumeSnapshotListCommand) Run(args []string) int { s := strings.Split(kv, "=") if len(s) == 2 { secrets[s[0]] = s[1] + } else { + c.Ui.Error("Secret must be in the format: -secret key=value") + return 1 } } diff --git a/nomad/csi_endpoint.go b/nomad/csi_endpoint.go index add9c3cd8b80..4043955cc2d3 100644 --- a/nomad/csi_endpoint.go +++ b/nomad/csi_endpoint.go @@ -1195,10 +1195,16 @@ func (v *CSIVolume) CreateSnapshot(args *structs.CSISnapshotCreateRequest, reply continue } + secrets := vol.Secrets + for k, v := range snap.Secrets { + // merge request secrets onto volume secrets + secrets[k] = v + } + cReq := &cstructs.ClientCSIControllerCreateSnapshotRequest{ ExternalSourceVolumeID: vol.ExternalID, Name: snap.Name, - Secrets: vol.Secrets, + Secrets: secrets, Parameters: snap.Parameters, } cReq.PluginID = pluginID diff --git a/website/content/docs/commands/volume/snapshot-create.mdx b/website/content/docs/commands/volume/snapshot-create.mdx index 241e18e0ee7a..24b5b9556bf6 100644 --- 
a/website/content/docs/commands/volume/snapshot-create.mdx +++ b/website/content/docs/commands/volume/snapshot-create.mdx @@ -1,5 +1,6 @@ --- layout: docs + page_title: 'Commands: volume snapshot create' description: | Create external volume snapshots. @@ -35,6 +36,16 @@ When ACLs are enabled, this command requires a token with the @include 'general_options.mdx' +## Snapshot Create Options + +- `-parameter`: Parameters to pass to the plugin to create a + snapshot. Accepts multiple flags in the form `-parameter key=value` + +- `-secret`: Secrets to pass to the plugin to create a snapshot. Accepts + multiple flags in the form `-secret key=value` + +- `-verbose`: Display full information for the resulting snapshot. + ## Examples Snapshot a volume: diff --git a/website/content/docs/commands/volume/snapshot-list.mdx b/website/content/docs/commands/volume/snapshot-list.mdx index 747de707f9aa..2e2c6649397c 100644 --- a/website/content/docs/commands/volume/snapshot-list.mdx +++ b/website/content/docs/commands/volume/snapshot-list.mdx @@ -29,6 +29,8 @@ Nomad. ## Snapshot List Options +- `-page-token`: Where to start pagination. +- `-per-page`: How many results to show per page. - `-plugin`: Display only snapshots managed by a particular [CSI plugin][csi_plugin]. This flag is required and accepts a plugin ID or prefix. If there is an exact match based on the provided plugin, @@ -36,8 +38,7 @@ Nomad. matching plugins will be displayed. - `-secret`: Secrets to pass to the plugin to list snapshots. Accepts multiple flags in the form `-secret key=value` -- `-per-page`: How many results to show per page. -- `-page-token`: Where to start pagination. +- `-verbose`: Display full information for the resulting snapshot. When ACLs are enabled, this command requires a token with the `csi-list-volumes` capability for the plugin's namespace. 
From c27af79add16a32cb4fd0a6cc1190af8424b9ffc Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Thu, 24 Mar 2022 13:40:42 -0500 Subject: [PATCH 09/16] client: cgroups v2 code review followup --- client/lib/cgutil/cgutil_linux.go | 12 ++++---- client/lib/cgutil/cgutil_linux_test.go | 6 ++-- client/lib/cgutil/cpuset_manager_v1.go | 2 +- client/lib/cgutil/cpuset_manager_v1_test.go | 14 ++++------ client/lib/cgutil/cpuset_manager_v2.go | 15 +++++----- client/lib/cgutil/cpuset_manager_v2_test.go | 2 ++ client/testutil/driver_compatible.go | 9 ++++++ client/testutil/driver_compatible_linux.go | 5 +--- .../shared/executor/executor_linux_test.go | 4 +-- .../content/docs/upgrade/upgrade-specific.mdx | 28 ++++++++++++------- 10 files changed, 57 insertions(+), 40 deletions(-) diff --git a/client/lib/cgutil/cgutil_linux.go b/client/lib/cgutil/cgutil_linux.go index 349660ec3e6b..33b2917ec40f 100644 --- a/client/lib/cgutil/cgutil_linux.go +++ b/client/lib/cgutil/cgutil_linux.go @@ -32,24 +32,26 @@ func GetCgroupParent(parent string) string { // CreateCPUSetManager creates a V1 or V2 CpusetManager depending on system configuration. func CreateCPUSetManager(parent string, logger hclog.Logger) CpusetManager { + parent = GetCgroupParent(parent) // use appropriate default parent if not set in client config if UseV2 { - return NewCpusetManagerV2(getParentV2(parent), logger.Named("cpuset.v2")) + return NewCpusetManagerV2(parent, logger.Named("cpuset.v2")) } - return NewCpusetManagerV1(getParentV1(parent), logger.Named("cpuset.v1")) + return NewCpusetManagerV1(parent, logger.Named("cpuset.v1")) } // GetCPUsFromCgroup gets the effective cpuset value for the given cgroup. 
func GetCPUsFromCgroup(group string) ([]uint16, error) { + group = GetCgroupParent(group) if UseV2 { - return getCPUsFromCgroupV2(getParentV2(group)) + return getCPUsFromCgroupV2(group) } - return getCPUsFromCgroupV1(getParentV1(group)) + return getCPUsFromCgroupV1(group) } // CgroupScope returns the name of the scope for Nomad's managed cgroups for // the given allocID and task. // -// e.g. "-.scope" +// e.g. "..scope" // // Only useful for v2. func CgroupScope(allocID, task string) string { diff --git a/client/lib/cgutil/cgutil_linux_test.go b/client/lib/cgutil/cgutil_linux_test.go index bbc18ef8c3e9..ed3ae87bd850 100644 --- a/client/lib/cgutil/cgutil_linux_test.go +++ b/client/lib/cgutil/cgutil_linux_test.go @@ -88,8 +88,10 @@ func TestUtil_GetCPUsFromCgroup(t *testing.T) { func create(t *testing.T, name string) { mgr, err := fs2.NewManager(nil, filepath.Join(CgroupRoot, name), rootless) require.NoError(t, err) - err = mgr.Apply(CreationPID) - require.NoError(t, err) + if err = mgr.Apply(CreationPID); err != nil { + _ = cgroups.RemovePath(name) + t.Fatal("failed to create cgroup for test") + } } func cleanup(t *testing.T, name string) { diff --git a/client/lib/cgutil/cpuset_manager_v1.go b/client/lib/cgutil/cpuset_manager_v1.go index 0e910705290e..f0fa32527467 100644 --- a/client/lib/cgutil/cpuset_manager_v1.go +++ b/client/lib/cgutil/cpuset_manager_v1.go @@ -117,12 +117,12 @@ func (c *cpusetManagerV1) CgroupPathFor(allocID, task string) CgroupPathGetter { break } - timer.Reset(100 * time.Millisecond) if _, err := os.Stat(taskInfo.CgroupPath); os.IsNotExist(err) { select { case <-ctx.Done(): return taskInfo.CgroupPath, ctx.Err() case <-timer.C: + timer.Reset(100 * time.Millisecond) continue } } diff --git a/client/lib/cgutil/cpuset_manager_v1_test.go b/client/lib/cgutil/cpuset_manager_v1_test.go index 3ada7b551a44..9537f2f87a86 100644 --- a/client/lib/cgutil/cpuset_manager_v1_test.go +++ b/client/lib/cgutil/cpuset_manager_v1_test.go @@ -106,18 +106,16 @@ func 
TestCpusetManager_V1_AddAlloc_single(t *testing.T) { func TestCpusetManager_V1_RemoveAlloc(t *testing.T) { testutil.CgroupsCompatibleV1(t) + // This case tests adding 2 allocations, reconciling then removing 1 alloc. + // It requires the system to have at least 3 cpu cores (one for each alloc), + // BUT plus another one because writing an empty cpuset causes the cgroup to + // inherit the parent. + testutil.MinimumCores(t, 3) + manager, cleanup := tmpCpusetManagerV1(t) defer cleanup() require.NoError(t, manager.Init(nil)) - // This case tests adding 2 allocs, reconciling then removing 1 alloc - // it requires the system to have at least 3 cpu cores (one for each alloc), - // BUT plus another one because writing an empty cpuset causes the cgroup to - // inherit the parent. - if manager.parentCpuset.Size() < 3 { - t.Skip("test requires at least 3 cpu cores") - } - alloc1 := mock.Alloc() alloc1Cpuset := cpuset.New(manager.parentCpuset.ToSlice()[0]) alloc1.AllocatedResources.Tasks["web"].Cpu.ReservedCores = alloc1Cpuset.ToSlice() diff --git a/client/lib/cgutil/cpuset_manager_v2.go b/client/lib/cgutil/cpuset_manager_v2.go index 00162ca523a5..74a8a4f4f74d 100644 --- a/client/lib/cgutil/cpuset_manager_v2.go +++ b/client/lib/cgutil/cpuset_manager_v2.go @@ -40,8 +40,8 @@ const ( // nothing is used for treating a map like a set with no values type nothing struct{} -// null represents nothing -var null = nothing{} +// present indicates something exists +var present = nothing{} type cpusetManagerV2 struct { logger hclog.Logger @@ -57,10 +57,9 @@ type cpusetManagerV2 struct { } func NewCpusetManagerV2(parent string, logger hclog.Logger) CpusetManager { - cgroupParent := getParentV2(parent) return &cpusetManagerV2{ - parent: cgroupParent, - parentAbs: filepath.Join(CgroupRoot, cgroupParent), + parent: parent, + parentAbs: filepath.Join(CgroupRoot, parent), logger: logger, sharing: make(map[identity]nothing), isolating: make(map[identity]cpuset.CPUSet), @@ -93,7 +92,7 @@ func (c 
*cpusetManagerV2) AddAlloc(alloc *structs.Allocation) { if len(resources.Cpu.ReservedCores) > 0 { c.isolating[id] = cpuset.New(resources.Cpu.ReservedCores...) } else { - c.sharing[id] = null + c.sharing[id] = present } } @@ -197,10 +196,10 @@ func (c *cpusetManagerV2) cleanup() { size := len(c.sharing) + len(c.isolating) ids := make(map[identity]nothing, size) for id := range c.sharing { - ids[id] = null + ids[id] = present } for id := range c.isolating { - ids[id] = null + ids[id] = present } if err := filepath.WalkDir(c.parentAbs, func(path string, entry os.DirEntry, err error) error { diff --git a/client/lib/cgutil/cpuset_manager_v2_test.go b/client/lib/cgutil/cpuset_manager_v2_test.go index e1d658c82589..a6acc50e76f7 100644 --- a/client/lib/cgutil/cpuset_manager_v2_test.go +++ b/client/lib/cgutil/cpuset_manager_v2_test.go @@ -24,6 +24,7 @@ var systemCores = []uint16{0, 1} func TestCpusetManager_V2_AddAlloc(t *testing.T) { testutil.CgroupsCompatibleV2(t) + testutil.MinimumCores(t, 2) logger := testlog.HCLogger(t) parent := uuid.Short() + ".scope" @@ -63,6 +64,7 @@ func cpusetIs(t *testing.T, exp, parent, allocID, task string) { func TestCpusetManager_V2_RemoveAlloc(t *testing.T) { testutil.CgroupsCompatibleV2(t) + testutil.MinimumCores(t, 2) logger := testlog.HCLogger(t) parent := uuid.Short() + ".scope" diff --git a/client/testutil/driver_compatible.go b/client/testutil/driver_compatible.go index 2459f8aa6c86..f0f84d5e52e6 100644 --- a/client/testutil/driver_compatible.go +++ b/client/testutil/driver_compatible.go @@ -92,3 +92,12 @@ func MountCompatible(t *testing.T) { t.Skip("Test requires root") } } + +// MinimumCores skips tests unless: +// - system has at least cores available CPU cores +func MinimumCores(t *testing.T, cores int) { + available := runtime.NumCPU() + if available < cores { + t.Skipf("Test requires at least %d cores, only %d available", cores, available) + } +} diff --git a/client/testutil/driver_compatible_linux.go 
b/client/testutil/driver_compatible_linux.go index 84ace6015f74..f30aba7ef386 100644 --- a/client/testutil/driver_compatible_linux.go +++ b/client/testutil/driver_compatible_linux.go @@ -3,7 +3,6 @@ package testutil import ( - "runtime" "testing" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -23,9 +22,7 @@ func CgroupsCompatibleV1(t *testing.T) { } func cgroupsCompatibleV1(t *testing.T) bool { - if runtime.GOOS != "linux" { - return false - } + // build tags mean this will never run outside of linux if cgroupsCompatibleV2(t) { t.Log("No cgroup.v1 mount point: running in cgroup.v2 mode") diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index d38072ad561c..585bf2d17308 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -371,8 +371,8 @@ func TestExecutor_CgroupPathsAreDestroyed(t *testing.T) { } // Skip rdma subsystem; rdma was added in most recent kernels and libcontainer/docker - // don't isolate it by default. - if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") { + // don't isolate it by default. And also misc. + if strings.Contains(line, ":rdma:") || strings.Contains(line, "::") || strings.Contains(line, ":misc:") { continue } diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index 874d2158e473..73b6d63e87af 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -96,21 +96,28 @@ The previous `Protocol` value can be viewed using the `-verbose` flag. #### Linux Control Groups Version 2 -Starting with Nomad 1.3.0, Linux systems configured to use [cgroups v2][cgroups2] are -now supported. A Nomad client will only activate its v2 control groups manager if the -system is configured with the cgroups2 controller mounted at `/sys/fs/cgroup`. 
This implies -Nomad will continue to fallback to the v1 control groups manager on systems -configured to run in hybrid mode, where the cgroups2 controller is typically mounted -at `/sys/fs/cgroup/unified`. Systems that do not support cgroups v2 are not affected. A -new client attribute `unique.cgroup.version` indicates which version of control groups -Nomad is using. +Starting with Nomad 1.3.0, Linux systems configured to use [cgroups v2][cgroups2] +are now supported. A Nomad client will only activate its v2 control groups manager +if the system is configured with the cgroups2 controller mounted at `/sys/fs/cgroup`. + * Systems that do not support cgroups v2 are not affected. + * Systems configured in hybrid mode typically mount the cgroups2 + controller at `/sys/fs/cgroup/unified`, so Nomad will continue to + use cgroups v1 for these hosts. + * Systems configured with only cgroups v2 now correctly support setting cpu [cores]. + +Nomad will preserve the existing cgroup for tasks when a client is +upgraded, so there will be no disruption to tasks. A new client +attribute `unique.cgroup.version` indicates which version of control +groups Nomad is using. When cgroups v2 are in use, Nomad uses `nomad.slice` as the [default parent][cgroup_parent] for cgroups -created on behalf of tasks. The cgroup created for a task is named in the form `-.scope`. +created on behalf of tasks. The cgroup created for a task is named in the form `..scope`. These cgroups are created by Nomad before a task starts. External task drivers that support containerization should be updated to make use of the new cgroup locations. -``` +The new cgroup file system layout will look like the following: + +```shell-session ➜ tree -d /sys/fs/cgroup/nomad.slice /sys/fs/cgroup/nomad.slice ├── 8b8da4cf-8ebf-b578-0bcf-77190749abf3.redis.scope @@ -1287,6 +1294,7 @@ deleted and then Nomad 0.3.0 can be launched. 
[api_jobs_parse]: /api-docs/jobs#parse-job [cgroups2]: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html [cgroup_parent]: /docs/configuration/client#cgroup_parent +[cores]: /docs/job-specification/resources#cores [dangling-containers]: /docs/drivers/docker#dangling-containers [drain-api]: /api-docs/nodes#drain-node [drain-cli]: /docs/commands/node/drain From 0783ac6eff8072e940905ca8faa3c9d044f29074 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Thu, 24 Mar 2022 14:42:00 -0400 Subject: [PATCH 10/16] core: store and check for Raft version changes (#12362) Downgrading the Raft version protocol is not a supported operation. Checking for a downgrade is hard since this information is not stored in any persistent place. When a server re-joins a cluster with a prior Raft version, the Serf tag is updated so Nomad can't tell that the version changed. Mixed version clusters must be supported to allow for zero-downtime rolling upgrades. During this it's expected that the cluster will have mixed Raft versions. Enforcing strong version consistency would disrupt this flow. The approach taken here is to store the Raft version on disk. When the server starts the `raft_protocol` value is written to the file `data_dir/raft/version`. If that file already exists, its content is checked against the current `raft_protocol` value to detect downgrades and prevent the server from starting. Any other types of errors are ignored to prevent disruptions that are outside the control of operators. The only option in cases of an invalid or corrupt file would be to delete it, making this check useless. So just overwrite its content with the new version and provide guidance on how to check that their cluster is in an expected state. 
--- .changelog/12362.txt | 3 +++ nomad/server.go | 46 ++++++++++++++++++++++++++++++++++++++++++++ nomad/server_test.go | 24 +++++++++++++++++++++++ nomad/testing.go | 13 ++++++++++--- 4 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 .changelog/12362.txt diff --git a/.changelog/12362.txt b/.changelog/12362.txt new file mode 100644 index 000000000000..7a8dcca5e585 --- /dev/null +++ b/.changelog/12362.txt @@ -0,0 +1,3 @@ +```release-note:improvement +server: store and check previous Raft protocol version to prevent downgrades +``` diff --git a/nomad/server.go b/nomad/server.go index f30d903c81f4..d2da13203716 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1284,6 +1284,16 @@ func (s *Server) setupRaft() error { return err } + // Check Raft version and update the version file. + raftVersionFilePath := filepath.Join(path, "version") + raftVersionFileContent := strconv.Itoa(int(s.config.RaftConfig.ProtocolVersion)) + if err := s.checkRaftVersionFile(raftVersionFilePath); err != nil { + return err + } + if err := ioutil.WriteFile(raftVersionFilePath, []byte(raftVersionFileContent), 0644); err != nil { + return fmt.Errorf("failed to write Raft version file: %v", err) + } + // Create the BoltDB backend, with NoFreelistSync option store, raftErr := raftboltdb.New(raftboltdb.Options{ Path: filepath.Join(path, "raft.db"), @@ -1399,6 +1409,42 @@ func (s *Server) setupRaft() error { return nil } +// checkRaftVersionFile reads the Raft version file and returns an error if +// the Raft version is incompatible with the current version configured. +// Provide best-effort check if the file cannot be read. 
+func (s *Server) checkRaftVersionFile(path string) error { + raftVersion := s.config.RaftConfig.ProtocolVersion + baseWarning := "use the 'nomad operator raft list-peers' command to make sure the Raft protocol versions are consistent" + + _, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + + s.logger.Warn(fmt.Sprintf("unable to read Raft version file, %s", baseWarning), "error", err) + return nil + } + + v, err := ioutil.ReadFile(path) + if err != nil { + s.logger.Warn(fmt.Sprintf("unable to read Raft version file, %s", baseWarning), "error", err) + return nil + } + + previousVersion, err := strconv.Atoi(strings.TrimSpace(string(v))) + if err != nil { + s.logger.Warn(fmt.Sprintf("invalid Raft protocol version in Raft version file, %s", baseWarning), "error", err) + return nil + } + + if raft.ProtocolVersion(previousVersion) > raftVersion { + return fmt.Errorf("downgrading Raft is not supported, current version is %d, previous version was %d", raftVersion, previousVersion) + } + + return nil +} + // setupSerf is used to setup and initialize a Serf func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) { conf.Init() diff --git a/nomad/server_test.go b/nomad/server_test.go index db1b1091e22d..858ac0dd715f 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -645,3 +645,27 @@ func TestServer_ReloadSchedulers_InvalidSchedulers(t *testing.T) { currentWC = s.GetSchedulerWorkerConfig() require.Equal(t, origWC, currentWC) } + +func TestServer_PreventRaftDowngrade(t *testing.T) { + ci.Parallel(t) + + dir := t.TempDir() + _, cleanupv3 := TestServer(t, func(c *Config) { + c.DevMode = false + c.DataDir = dir + c.RaftConfig.ProtocolVersion = 3 + }) + cleanupv3() + + _, cleanupv2, err := TestServerErr(t, func(c *Config) { + c.DevMode = false + c.DataDir = dir + c.RaftConfig.ProtocolVersion = 2 + }) + if cleanupv2 != nil { + defer cleanupv2() + } + + // Downgrading Raft should prevent the 
server from starting. + require.Error(t, err) +} diff --git a/nomad/testing.go b/nomad/testing.go index 9fbe2ca02e41..3dd3d5ba2c71 100644 --- a/nomad/testing.go +++ b/nomad/testing.go @@ -17,6 +17,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/version" "github.com/pkg/errors" + "github.com/stretchr/testify/require" ) var ( @@ -39,6 +40,12 @@ func TestACLServer(t *testing.T, cb func(*Config)) (*Server, *structs.ACLToken, } func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) { + s, c, err := TestServerErr(t, cb) + require.NoError(t, err, "failed to start test server") + return s, c +} + +func TestServerErr(t *testing.T, cb func(*Config)) (*Server, func(), error) { // Setup the default settings config := DefaultConfig() @@ -137,10 +144,10 @@ func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) { case <-time.After(1 * time.Minute): t.Fatal("timed out while shutting down server") } - } + }, nil } else if i == 0 { freeport.Return(ports) - t.Fatalf("err: %v", err) + return nil, nil, err } else { if server != nil { _ = server.Shutdown() @@ -151,7 +158,7 @@ func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) { } } - return nil, nil + return nil, nil, nil } func TestJoin(t *testing.T, servers ...*Server) { From cb5443841177f4cde35e707d923a351c2f85434b Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 24 Mar 2022 11:44:21 -0700 Subject: [PATCH 11/16] core: add deprecated mvn tag to serf (#12327) Revert a small part of #11600 after @lgfa29 discovered it would break compatibility with Nomad <= v1.2! Nomad <= v1.2 expects the `vsn` tag to exist in Serf. It has always been `1`. It has no functional purpose. However it causes a parsing error if it is not set: https://github.com/hashicorp/nomad/blob/v1.2.6/nomad/util.go#L103-L108 This means Nomad servers at version v1.2 or older will not allow servers without this tag to join. The `mvn` minor version tag is also checked, but soft fails. 
I'm not setting that because I want as much of this cruft gone as possible. --- nomad/server.go | 1 + nomad/util.go | 39 ++++++++++++++++++++++++++------------- nomad/util_test.go | 3 +++ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/nomad/server.go b/nomad/server.go index d2da13203716..1eb67addc009 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1453,6 +1453,7 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( conf.Tags["region"] = s.config.Region conf.Tags["dc"] = s.config.Datacenter conf.Tags["build"] = s.config.Build + conf.Tags["vsn"] = deprecatedAPIMajorVersionStr // for Nomad <= v1.2 compat conf.Tags["raft_vsn"] = fmt.Sprintf("%d", s.config.RaftConfig.ProtocolVersion) conf.Tags["id"] = s.config.NodeID conf.Tags["rpc_addr"] = s.clientRpcAdvertise.(*net.TCPAddr).IP.String() // Address that clients will use to RPC to servers diff --git a/nomad/util.go b/nomad/util.go index 0b45dd4305e5..5ea347f2b6ba 100644 --- a/nomad/util.go +++ b/nomad/util.go @@ -15,6 +15,14 @@ import ( "github.com/hashicorp/serf/serf" ) +const ( + // Deprecated: Through Nomad v1.2 these values were configurable but + // functionally unused. We still need to advertise them in Serf for + // compatibility with v1.2 and earlier. + deprecatedAPIMajorVersion = 1 + deprecatedAPIMajorVersionStr = "1" +) + // MinVersionPlanNormalization is the minimum version to support the // normalization of Plan in SubmitPlan, and the denormalization raft log entry committed // in ApplyPlanResultsRequest @@ -43,6 +51,10 @@ type serverParts struct { RPCAddr net.Addr Status serf.MemberStatus NonVoter bool + + // Deprecated: Functionally unused but needs to always be set by 1 for + // compatibility with v1.2.x and earlier. 
+ MajorVersion int } func (s *serverParts) String() string { @@ -113,19 +125,20 @@ func isNomadServer(m serf.Member) (bool, *serverParts) { addr := &net.TCPAddr{IP: m.Addr, Port: port} rpcAddr := &net.TCPAddr{IP: rpcIP, Port: port} parts := &serverParts{ - Name: m.Name, - ID: id, - Region: region, - Datacenter: datacenter, - Port: port, - Bootstrap: bootstrap, - Expect: expect, - Addr: addr, - RPCAddr: rpcAddr, - Build: *buildVersion, - RaftVersion: raftVsn, - Status: m.Status, - NonVoter: nonVoter, + Name: m.Name, + ID: id, + Region: region, + Datacenter: datacenter, + Port: port, + Bootstrap: bootstrap, + Expect: expect, + Addr: addr, + RPCAddr: rpcAddr, + Build: *buildVersion, + RaftVersion: raftVsn, + Status: m.Status, + NonVoter: nonVoter, + MajorVersion: deprecatedAPIMajorVersion, } return true, parts } diff --git a/nomad/util_test.go b/nomad/util_test.go index af7583fd0247..b46e4d3f8a92 100644 --- a/nomad/util_test.go +++ b/nomad/util_test.go @@ -24,6 +24,7 @@ func TestIsNomadServer(t *testing.T) { "dc": "east-aws", "rpc_addr": "1.1.1.1", "port": "10000", + "vsn": "1", "raft_vsn": "2", "build": "0.7.0+ent", "nonvoter": "1", @@ -52,6 +53,7 @@ func TestIsNomadServer(t *testing.T) { if parts.RPCAddr.String() != "1.1.1.1:10000" { t.Fatalf("bad: %v", parts.RPCAddr.String()) } + require.Equal(t, 1, parts.MajorVersion) if seg := parts.Build.Segments(); len(seg) != 3 { t.Fatalf("bad: %v", parts.Build) } else if seg[0] != 0 && seg[1] != 7 && seg[2] != 0 { @@ -201,6 +203,7 @@ func makeMember(version string, status serf.MemberStatus) serf.Member { "dc": "east-aws", "port": "10000", "build": version, + "vsn": "1", }, Status: status, } From 47b07f55954fd7ac6975525d5822b6f5cd09ccd9 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Thu, 24 Mar 2022 15:31:47 -0400 Subject: [PATCH 12/16] tests: fix rpc limit tests (#12364) --- nomad/rpc_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/nomad/rpc_test.go b/nomad/rpc_test.go index d7c0dc910b30..0ae94f0c7498 
100644 --- a/nomad/rpc_test.go +++ b/nomad/rpc_test.go @@ -17,6 +17,7 @@ import ( "time" "github.com/hashicorp/go-msgpack/codec" + "github.com/hashicorp/go-sockaddr" msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/ci" cstructs "github.com/hashicorp/nomad/client/structs" @@ -868,6 +869,15 @@ func TestRPC_Limits_OK(t *testing.T) { } c.RPCHandshakeTimeout = tc.timeout c.RPCMaxConnsPerClient = tc.limit + + // Bind the server to a private IP so that Autopilot's + // StatsFetcher requests come from a different IP than the test + // requests, otherwise they would interfere with the connection + // rate limiter since limits are imposed by IP address. + ip, err := sockaddr.GetPrivateIP() + require.NoError(t, err) + c.RPCAddr.IP = []byte(ip) + c.SerfConfig.MemberlistConfig.BindAddr = ip }) defer func() { cleanup() From 6dd1842560825f523d3f78410e6db09525804e0d Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Thu, 24 Mar 2022 14:58:43 -0500 Subject: [PATCH 13/16] ci: cleanup verbose mode and enable for gha test_checks.sh was removed in 2019 and now just breaks if VERBOSE is set. When running tests via make targets in GHA, use verbose mode to display what tests are running. --- .github/workflows/test-core.yaml | 1 + GNUmakefile | 10 ++-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-core.yaml b/.github/workflows/test-core.yaml index 672458ef3b9d..8ed7dc2b4ae5 100644 --- a/.github/workflows/test-core.yaml +++ b/.github/workflows/test-core.yaml @@ -20,6 +20,7 @@ on: - 'ui/*' - 'website/*' env: + VERBOSE: 1 GO_VERSION: 1.17.7 GOBIN: /usr/local/bin GOTESTARCH: amd64 diff --git a/GNUmakefile b/GNUmakefile index 64cd3856f460..39a986bea606 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -301,10 +301,7 @@ test-nomad: dev ## Run Nomad test suites -cover \ -timeout=20m \ -tags "$(GO_TAGS)" \ - $(GOTEST_PKGS) $(if $(VERBOSE), >test.log ; echo $$? 
> exit-code) - @if [ $(VERBOSE) ] ; then \ - bash -C "$(PROJECT_ROOT)/scripts/test_check.sh" ; \ - fi + $(GOTEST_PKGS) .PHONY: test-nomad-module test-nomad-module: dev ## Run Nomad test suites on a sub-module @@ -314,10 +311,7 @@ test-nomad-module: dev ## Run Nomad test suites on a sub-module -cover \ -timeout=20m \ -tags "$(GO_TAGS)" \ - ./... $(if $(VERBOSE), >test.log ; echo $$? > exit-code) - @if [ $(VERBOSE) ] ; then \ - bash -C "$(PROJECT_ROOT)/scripts/test_check.sh" ; \ - fi + ./... .PHONY: e2e-test e2e-test: dev ## Run the Nomad e2e test suite From 17347c5ff2b5eca382c3c1ea9fb2f2aa0e198159 Mon Sep 17 00:00:00 2001 From: Karthick Ramachandran Date: Thu, 24 Mar 2022 13:38:43 -0700 Subject: [PATCH 14/16] make stop job message clearer (#12252) --- .changelog/12252.txt | 3 +++ ui/app/templates/allocations/allocation/index.hbs | 12 ++++++------ .../templates/allocations/allocation/task/index.hbs | 4 ++-- ui/app/templates/clients/client/index.hbs | 2 +- .../components/job-page/parts/latest-deployment.hbs | 2 +- ui/app/templates/components/job-page/parts/title.hbs | 8 ++++---- ui/app/templates/components/job-version.hbs | 4 ++-- 7 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 .changelog/12252.txt diff --git a/.changelog/12252.txt b/.changelog/12252.txt new file mode 100644 index 000000000000..408f0fca6a0c --- /dev/null +++ b/.changelog/12252.txt @@ -0,0 +1,3 @@ +```release-note:improvement +ui: make buttons with confirmation more descriptive of their actions +``` diff --git a/ui/app/templates/allocations/allocation/index.hbs b/ui/app/templates/allocations/allocation/index.hbs index 0069a9fba85a..e463ad9eba62 100644 --- a/ui/app/templates/allocations/allocation/index.hbs +++ b/ui/app/templates/allocations/allocation/index.hbs @@ -42,9 +42,9 @@ @@ -28,9 +28,9 @@ diff --git a/ui/app/templates/components/job-version.hbs b/ui/app/templates/components/job-version.hbs index a7076d75f905..74d4308b90be 100644 --- 
a/ui/app/templates/components/job-version.hbs +++ b/ui/app/templates/components/job-version.hbs @@ -17,9 +17,9 @@ idleButton="is-warning is-outlined" confirmButton="is-warning"}} @alignRight={{true}} - @idleText="Revert" + @idleText="Revert Version" @cancelText="Cancel" - @confirmText="Yes, Revert" + @confirmText="Yes, Revert Version" @confirmationMessage="Are you sure you want to revert to this version?" @inlineText={{true}} @fadingBackground={{true}} From deedd790ce8060bfc16b4f8d21f056a8f7f5af69 Mon Sep 17 00:00:00 2001 From: dgotlieb Date: Thu, 24 Mar 2022 23:09:19 +0200 Subject: [PATCH 15/16] Add grpc and http2 listeners to gateway docs (#12367) Starting at Nomad version 1.2.0, `grpc` and `http2` [protocols are supported](https://github.com/hashicorp/nomad/pull/11187) --- website/content/docs/job-specification/gateway.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/content/docs/job-specification/gateway.mdx b/website/content/docs/job-specification/gateway.mdx index 50641bbd6bde..ef3474ffcb61 100644 --- a/website/content/docs/job-specification/gateway.mdx +++ b/website/content/docs/job-specification/gateway.mdx @@ -101,8 +101,8 @@ envoy_gateway_bind_addresses "" { #### `listener` Parameters - `port` `(int: required)` - The port that the listener should receive traffic on. -- `protocol` `(string: "tcp")` - The protocol associated with the listener. Either - `tcp` or `http`. +- `protocol` `(string: "tcp")` - The protocol associated with the listener. One + of `tcp`, `http`, `http2`, or `grpc`. 
~> **Note:** If using `http`, preconfiguring a [service-default] in Consul to set the [Protocol](https://www.consul.io/docs/agent/config-entries/service-defaults#protocol) From 7edf1885c43dcafe746da34258e966935b208c23 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Fri, 25 Mar 2022 10:11:46 -0400 Subject: [PATCH 16/16] docs: fix link and add note about Nomad v1.3.0 on raft v3 upgrade (#12378) --- website/content/docs/upgrade/index.mdx | 30 +++++++++++++++----------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/website/content/docs/upgrade/index.mdx b/website/content/docs/upgrade/index.mdx index ec762aabc6cd..32fd137d741d 100644 --- a/website/content/docs/upgrade/index.mdx +++ b/website/content/docs/upgrade/index.mdx @@ -181,22 +181,24 @@ to be added. For production raft clusters with 3 or more memebrs, the easiest way to upgrade servers is to have each server leave the cluster, upgrade -its [`raft_protocol`] version in the `server` stanza, and then add it -back. Make sure the new server joins successfully and that the cluster -is stable before rolling the upgrade forward to the next server. It's -also possible to stand up a new set of servers, and then slowly stand -down each of the older servers in a similar fashion. +its [`raft_protocol`] version in the `server` stanza (if upgrading to +a version lower than v1.3.0), and then add it back. Make sure the new +server joins successfully and that the cluster is stable before +rolling the upgrade forward to the next server. It's also possible to +stand up a new set of servers, and then slowly stand down each of the +older servers in a similar fashion. For in-place raft protocol upgrades, perform the following for each server, leaving the leader until last to reduce the chance of leader elections that will slow down the process: -* Stop the server -* Run `nomad server force-leave $server_name` -* Update the `raft_protocol` in the server's configuration file to 3. 
-* Restart the server -* Run `nomad operator raft list-peers` to verify that the `raft_vsn` - for the server is now 3. +* Stop the server. +* Run `nomad server force-leave $server_name`. +* If the upgrade is for a Nomad version lower than v1.3.0, update the + [`raft_protocol`] in the server's configuration file to `3`. +* Restart the server. +* Run `nomad operator raft list-peers` to verify that the + `RaftProtocol` for the server is now `3`. * On the server, run `nomad agent-info` and check that the `last_log_index` is of a similar value to the other servers. This step ensures that raft is healthy and changes are replicating to the @@ -229,7 +231,9 @@ cat < "$NOMAD_DATA_DIR/server/raft/peers.json" EOF ``` -After running this script, update the `raft_protocol` in the server's -configuration to 3 and restart the server. +After running this script, if the upgrade is for a Nomad version lower +than v1.3.0, update the [`raft_protocol`] in the server's +configuration to `3` and restart the server. [peers-json]: https://learn.hashicorp.com/tutorials/nomad/outage-recovery#manual-recovery-using-peersjson +[`raft_protocol`]: /docs/configuration/server#raft_protocol