Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perserve device ids in in-place alloc updates #7762

Merged
merged 3 commits into from
Apr 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ BUG FIXES:
* connect: Fixed a bug where some connect proxy fields would be dropped from 'job inspect' output [[GH-7397](https://github.com/hashicorp/nomad/issues/7397)]
* connect: Fixed a bug where an absent connect sidecar_service stanza would trigger panic [[GH-7683](https://github.com/hashicorp/nomad/pull/7683)]
* connect: Fixed bugs where some connect parameters would be ignored [[GH-7690](https://github.com/hashicorp/nomad/pull/7690)] [[GH-7684](https://github.com/hashicorp/nomad/pull/7684)]
* scheduler: Fixed a bug in managing allocated devices for a job allocation in in-place update scenarios [[GH-7762](https://github.com/hashicorp/nomad/issues/7762)]

## 0.11.0 (April 8, 2020)

Expand Down
17 changes: 15 additions & 2 deletions scheduler/generic_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2072,6 +2072,15 @@ func TestServiceSched_JobModify_InPlace(t *testing.T) {
require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
require.NoError(t, h.State.UpsertDeployment(h.NextIndex(), d))

taskName := job.TaskGroups[0].Tasks[0].Name

adr := structs.AllocatedDeviceResource{
Type: "gpu",
Vendor: "nvidia",
Name: "1080ti",
DeviceIDs: []string{uuid.Generate()},
}

// Create allocs that are part of the old deployment
var allocs []*structs.Allocation
for i := 0; i < 10; i++ {
Expand All @@ -2082,6 +2091,7 @@ func TestServiceSched_JobModify_InPlace(t *testing.T) {
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
alloc.DeploymentID = d.ID
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{Healthy: helper.BoolToPtr(true)}
alloc.AllocatedResources.Tasks[taskName].Devices = []*structs.AllocatedDeviceResource{&adr}
allocs = append(allocs, alloc)
}
require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
Expand Down Expand Up @@ -2155,13 +2165,16 @@ func TestServiceSched_JobModify_InPlace(t *testing.T) {
}
h.AssertEvalStatus(t, structs.EvalStatusComplete)

// Verify the network did not change
// Verify the allocated networks and devices did not change
rp := structs.Port{Label: "admin", Value: 5000}
for _, alloc := range out {
for _, resources := range alloc.TaskResources {
for _, resources := range alloc.AllocatedResources.Tasks {
if resources.Networks[0].ReservedPorts[0] != rp {
t.Fatalf("bad: %#v", alloc)
}
if len(resources.Devices) == 0 || reflect.DeepEqual(resources.Devices[0], adr) {
t.Fatalf("bad devices has changed: %#v", alloc)
}
}
}

Expand Down
12 changes: 9 additions & 3 deletions scheduler/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -614,22 +614,25 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
continue
}

// Restore the network offers from the existing allocation.
// Restore the network and device offers from the existing allocation.
// We do not allow network resources (reserved/dynamic ports)
// to be updated. This is guarded in taskUpdated, so we can
// safely restore those here.
for task, resources := range option.TaskResources {
var networks structs.Networks
var devices []*structs.AllocatedDeviceResource
if update.Alloc.AllocatedResources != nil {
if tr, ok := update.Alloc.AllocatedResources.Tasks[task]; ok {
networks = tr.Networks
devices = tr.Devices
}
} else if tr, ok := update.Alloc.TaskResources[task]; ok {
networks = tr.Networks
}

// Add thhe networks back
// Add the networks and devices back
resources.Networks = networks
resources.Devices = devices
}

// Create a shallow copy
Expand Down Expand Up @@ -892,22 +895,25 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy
return false, true, nil
}

// Restore the network offers from the existing allocation.
// Restore the network and device offers from the existing allocation.
// We do not allow network resources (reserved/dynamic ports)
// to be updated. This is guarded in taskUpdated, so we can
// safely restore those here.
for task, resources := range option.TaskResources {
var networks structs.Networks
var devices []*structs.AllocatedDeviceResource
if existing.AllocatedResources != nil {
if tr, ok := existing.AllocatedResources.Tasks[task]; ok {
networks = tr.Networks
devices = tr.Devices
}
} else if tr, ok := existing.TaskResources[task]; ok {
networks = tr.Networks
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In 0.11, existing.AllocatedResources should always be non-nil. The type of existing.TaskResources[task].Devices is different, so I didn't bother making the compatibility path.

}

// Add the networks back
resources.Networks = networks
resources.Devices = devices
}

// Create a shallow copy
Expand Down