From 4c00f6aa78415906f6cdff0158b4972e82cdfb03 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 3 Aug 2023 14:58:30 -0400 Subject: [PATCH] scheduler: filter device instance IDs by constraints (#18141) When the scheduler assigns a device instance, it iterates over the feasible devices and then picks the first instance with availability. If the jobspec uses a constraint on device ID, this can lead to buggy/surprising behavior where the node's device matches the constraint but then the individual device instance does not. Add a second filter based on the `${device.ids}` constraint after selecting a node's device to ensure the device instance ID falls within the constraint as well. Fixes: #18112 --- .changelog/18141.txt | 3 ++ scheduler/device.go | 35 +++++++++++++++++++++++- scheduler/device_test.go | 59 ++++++++++++++++++++++++++++------------ 3 files changed, 78 insertions(+), 19 deletions(-) create mode 100644 .changelog/18141.txt diff --git a/.changelog/18141.txt b/.changelog/18141.txt new file mode 100644 index 000000000000..f91530ea7c75 --- /dev/null +++ b/.changelog/18141.txt @@ -0,0 +1,3 @@ +```release-note:bug +scheduler: Fixed a bug where device IDs were not correctly filtered in constraints +``` diff --git a/scheduler/device.go b/scheduler/device.go index b26168d50fc6..a2117dfcc63c 100644 --- a/scheduler/device.go +++ b/scheduler/device.go @@ -6,6 +6,7 @@ import ( "math" "github.com/hashicorp/nomad/nomad/structs" + psstructs "github.com/hashicorp/nomad/plugins/shared/structs" ) // deviceAllocator is used to allocate devices to allocations. The allocator @@ -112,7 +113,8 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc assigned := uint64(0) for id, v := range devInst.Instances { - if v == 0 && assigned < ask.Count { + if v == 0 && assigned < ask.Count && + d.deviceIDMatchesConstraint(id, ask.Constraints, devInst.Device) { assigned++ offer.DeviceIDs = append(offer.DeviceIDs, id) if assigned == ask.Count { @@ -129,3 +131,34 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc return offer, matchedWeights, nil } + +// deviceIDMatchesConstraint checks a device instance ID against the constraints +// to ensure we're only assigning instance IDs that match. This is a narrower +// check than nodeDeviceMatches because we've already asserted that the device +// matches and now need to filter by instance ID. 
+func (d *deviceAllocator) deviceIDMatchesConstraint(id string, constraints structs.Constraints, device *structs.NodeDeviceResource) bool { + + // There are no constraints to consider + if len(constraints) == 0 { + return true + } + + deviceID := psstructs.NewStringAttribute(id) + + for _, c := range constraints { + var target *psstructs.Attribute + if c.LTarget == "${device.ids}" { + target, _ = resolveDeviceTarget(c.RTarget, device) + } else if c.RTarget == "${device.ids}" { + target, _ = resolveDeviceTarget(c.LTarget, device) + } else { + continue + } + + if !checkAttributeConstraint(d.ctx, c.Operand, target, deviceID, true, true) { + return false + } + } + + return true +} diff --git a/scheduler/device_test.go b/scheduler/device_test.go index 7bf6319bc2a0..9a4a7f70aa38 100644 --- a/scheduler/device_test.go +++ b/scheduler/device_test.go @@ -8,6 +8,7 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" psstructs "github.com/hashicorp/nomad/plugins/shared/structs" + "github.com/shoenig/test/must" "github.com/stretchr/testify/require" ) @@ -161,13 +162,16 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) { nvidia1 := n.NodeResources.Devices[1] cases := []struct { - Name string - Constraints []*structs.Constraint - ExpectedDevice *structs.NodeDeviceResource - NoPlacement bool + Name string + Note string + Constraints []*structs.Constraint + ExpectedDevice *structs.NodeDeviceResource + ExpectedDeviceIDs []string + NoPlacement bool }{ { Name: "gpu", + Note: "-gt", Constraints: []*structs.Constraint{ { LTarget: "${device.attr.cuda_cores}", @@ -175,10 +179,12 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) { RTarget: "4000", }, }, - ExpectedDevice: nvidia1, + ExpectedDevice: nvidia1, + ExpectedDeviceIDs: collectInstanceIDs(nvidia1), }, { Name: "gpu", + Note: "-lt", Constraints: []*structs.Constraint{ { LTarget: "${device.attr.cuda_cores}", @@ -186,7 +192,8 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) { RTarget: "4000", }, }, - ExpectedDevice: nvidia0, + ExpectedDevice: nvidia0, + ExpectedDeviceIDs: collectInstanceIDs(nvidia0), }, { Name: "nvidia/gpu", @@ -208,7 +215,8 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) { RTarget: "1.4 GHz", }, }, - ExpectedDevice: nvidia0, + ExpectedDevice: nvidia0, + ExpectedDeviceIDs: collectInstanceIDs(nvidia0), }, { Name: "intel/gpu", @@ -216,6 +224,7 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) { }, { Name: "nvidia/gpu", + Note: "-no-placement", Constraints: []*structs.Constraint{ { LTarget: "${device.attr.memory_bandwidth}", @@ -236,29 +245,43 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) { }, NoPlacement: true, }, + { + Name: "nvidia/gpu", + Note: "-contains-id", + Constraints: []*structs.Constraint{ + { + LTarget: "${device.ids}", + Operand: "set_contains", + RTarget: nvidia0.Instances[1].ID, + }, + }, + ExpectedDevice: nvidia0, + ExpectedDeviceIDs: []string{nvidia0.Instances[1].ID}, + }, } for _, c := range cases { - t.Run(c.Name, func(t *testing.T) { - require := require.New(t) + t.Run(c.Name+c.Note, func(t *testing.T) { _, ctx := testContext(t) d := newDeviceAllocator(ctx, n) - require.NotNil(d) + must.NotNil(t, d) // Build the request ask := deviceRequest(c.Name, 1, c.Constraints, nil) out, score, err := d.AssignDevice(ask) if c.NoPlacement { - require.Nil(out) + require.Nil(t, out) } else { - require.NotNil(out) - require.Zero(score) - require.NoError(err) - - // Check that we got the nvidia device - 
require.Len(out.DeviceIDs, 1)
-				require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
+				must.NotNil(t, out)
+				must.Zero(t, score)
+				must.NoError(t, err)
+
+				// Check that we got the right nvidia device instance and,
+				// where the case specifies them, the expected instance IDs
+				must.Len(t, 1, out.DeviceIDs)
+				must.SliceContains(t, collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
+				must.SliceContainsSubset(t, c.ExpectedDeviceIDs, out.DeviceIDs)
 			}
 		})
 	}
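
For illustration, the instance-ID filter this patch adds can be sketched in isolation. The snippet below is not the scheduler's code: `Constraint` and `idMatchesConstraints` are hypothetical stand-ins for `structs.Constraint` and the new `deviceIDMatchesConstraint`, and only the `set_contains` operand is modeled here, whereas the real function delegates to `resolveDeviceTarget` and `checkAttributeConstraint` so that every constraint operand is honored.

```go
package main

import (
	"fmt"
	"strings"
)

// Constraint mirrors the shape of a Nomad jobspec constraint for this
// illustration only; the scheduler's real type is structs.Constraint.
type Constraint struct {
	LTarget string
	Operand string
	RTarget string
}

// idMatchesConstraints is a simplified stand-in for the patch's
// deviceIDMatchesConstraint. It models only the "set_contains" operand:
// a candidate instance ID is eligible when every "${device.ids}"
// constraint lists it in its comma-separated RTarget.
func idMatchesConstraints(id string, constraints []Constraint) bool {
	for _, c := range constraints {
		if c.LTarget != "${device.ids}" || c.Operand != "set_contains" {
			// Not an instance-ID constraint; the device itself has
			// already been vetted against these.
			continue
		}
		eligible := false
		for _, allowed := range strings.Split(c.RTarget, ",") {
			if strings.TrimSpace(allowed) == id {
				eligible = true
				break
			}
		}
		if !eligible {
			return false
		}
	}
	return true
}

func main() {
	// A constraint pinning the allocation to one specific instance,
	// as in the new "-contains-id" test case.
	constraints := []Constraint{{
		LTarget: "${device.ids}",
		Operand: "set_contains",
		RTarget: "GPU-uuid-b",
	}}

	// Before this patch the allocator offered the first free instance
	// (here GPU-uuid-a) even when only GPU-uuid-b satisfied the
	// constraint; with the extra filter, only GPU-uuid-b is eligible.
	for _, id := range []string{"GPU-uuid-a", "GPU-uuid-b"} {
		fmt.Printf("%s eligible: %v\n", id, idMatchesConstraints(id, constraints))
	}
}
```

Running the sketch prints `GPU-uuid-a eligible: false` and `GPU-uuid-b eligible: true`, mirroring the new `-contains-id` test case: the allocator no longer offers the first free instance when a `${device.ids}` constraint names a different one.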