Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scheduler: filter device instance IDs by constraints #18141

Merged
merged 1 commit into from
Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/18141.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
scheduler: Fixed a bug where device IDs were not correctly filtered in constraints
```
35 changes: 34 additions & 1 deletion scheduler/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"math"

"github.com/hashicorp/nomad/nomad/structs"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
)

// deviceAllocator is used to allocate devices to allocations. The allocator
Expand Down Expand Up @@ -115,7 +116,8 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc

assigned := uint64(0)
for id, v := range devInst.Instances {
if v == 0 && assigned < ask.Count {
if v == 0 && assigned < ask.Count &&
d.deviceIDMatchesConstraint(id, ask.Constraints, devInst.Device) {
assigned++
offer.DeviceIDs = append(offer.DeviceIDs, id)
if assigned == ask.Count {
Expand All @@ -132,3 +134,34 @@ func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *struc

return offer, matchedWeights, nil
}

// deviceIDMatchesConstraint checks a device instance ID against the constraints
// to ensure we're only assigning instance IDs that match. This is a narrower
// check than nodeDeviceMatches because we've already asserted that the device
// matches and now need to filter by instance ID.
func (d *deviceAllocator) deviceIDMatchesConstraint(id string, constraints structs.Constraints, device *structs.NodeDeviceResource) bool {

// There are no constraints to consider
if len(constraints) == 0 {
return true
}

deviceID := psstructs.NewStringAttribute(id)

for _, c := range constraints {
var target *psstructs.Attribute
if c.LTarget == "${device.ids}" {
target, _ = resolveDeviceTarget(c.RTarget, device)
} else if c.RTarget == "${device.ids}" {
target, _ = resolveDeviceTarget(c.LTarget, device)
} else {
continue
}

if !checkAttributeConstraint(d.ctx, c.Operand, target, deviceID, true, true) {
return false
}
}

return true
}
59 changes: 41 additions & 18 deletions scheduler/device_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -164,32 +165,38 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
nvidia1 := n.NodeResources.Devices[1]

cases := []struct {
Name string
Constraints []*structs.Constraint
ExpectedDevice *structs.NodeDeviceResource
NoPlacement bool
Name string
Note string
Constraints []*structs.Constraint
ExpectedDevice *structs.NodeDeviceResource
ExpectedDeviceIDs []string
NoPlacement bool
}{
{
Name: "gpu",
Note: "-gt",
Constraints: []*structs.Constraint{
{
LTarget: "${device.attr.cuda_cores}",
Operand: ">",
RTarget: "4000",
},
},
ExpectedDevice: nvidia1,
ExpectedDevice: nvidia1,
ExpectedDeviceIDs: collectInstanceIDs(nvidia1),
},
{
Name: "gpu",
Note: "-lt",
Constraints: []*structs.Constraint{
{
LTarget: "${device.attr.cuda_cores}",
Operand: "<",
RTarget: "4000",
},
},
ExpectedDevice: nvidia0,
ExpectedDevice: nvidia0,
ExpectedDeviceIDs: collectInstanceIDs(nvidia0),
},
{
Name: "nvidia/gpu",
Expand All @@ -211,14 +218,16 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
RTarget: "1.4 GHz",
},
},
ExpectedDevice: nvidia0,
ExpectedDevice: nvidia0,
ExpectedDeviceIDs: collectInstanceIDs(nvidia0),
},
{
Name: "intel/gpu",
NoPlacement: true,
},
{
Name: "nvidia/gpu",
Note: "-no-placement",
Constraints: []*structs.Constraint{
{
LTarget: "${device.attr.memory_bandwidth}",
Expand All @@ -239,29 +248,43 @@ func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
},
NoPlacement: true,
},
{
Name: "nvidia/gpu",
Note: "-contains-id",
Constraints: []*structs.Constraint{
{
LTarget: "${device.ids}",
Operand: "set_contains",
RTarget: nvidia0.Instances[1].ID,
},
},
ExpectedDevice: nvidia0,
ExpectedDeviceIDs: []string{nvidia0.Instances[1].ID},
},
}

for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
require := require.New(t)
t.Run(c.Name+c.Note, func(t *testing.T) {
_, ctx := testContext(t)
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
must.NotNil(t, d)

// Build the request
ask := deviceRequest(c.Name, 1, c.Constraints, nil)

out, score, err := d.AssignDevice(ask)
if c.NoPlacement {
require.Nil(out)
require.Nil(t, out)
} else {
require.NotNil(out)
require.Zero(score)
require.NoError(err)

// Check that we got the nvidia device
require.Len(out.DeviceIDs, 1)
require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
must.NotNil(t, out)
must.Zero(t, score)
must.NoError(t, err)

// Check that we got the right nvidia device instance, and
// specific device instance IDs if required
must.Len(t, 1, out.DeviceIDs)
must.SliceContains(t, collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
must.SliceContainsSubset(t, c.ExpectedDeviceIDs, out.DeviceIDs)
}
})
}
Expand Down