Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Makes auto reverts robust against previously stable job that fail to start correctly upon reverting #3496

Merged
merged 8 commits into from
Nov 3, 2017
8 changes: 7 additions & 1 deletion nomad/deploymentwatcher/deployment_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,13 @@ func (w *deploymentWatcher) watch() {
// Description should include that the job is being rolled back to
// version N
if j != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
// only revert if job being changed has a different spec
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Capitalize the start of your comments.
  2. Can we add more detail in the comment explaining how this can avoid an infinite revert cycle.

if w.j.SpecChanged(j) {
desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
} else {
desc = structs.DeploymentStatusDescriptionRollbackFailed(desc, j.Version, w.j.Version)
j = nil
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to do the same on SetAllocHealth and FailDeployment

} else {
desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
}
Expand Down
108 changes: 108 additions & 0 deletions nomad/deploymentwatcher/deployments_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,8 @@ func TestDeploymentWatcher_Watch(t *testing.T) {

// Upsert the job again to get a new version
j2 := j.Copy()
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

Expand Down Expand Up @@ -734,6 +736,112 @@ func TestDeploymentWatcher_Watch(t *testing.T) {
func(err error) { assert.Equal(0, len(w.watchers), "Should have no deployment") })
}

// Tests that the watcher fails rollback when the spec hasn't changed
func TestDeploymentWatcher_RollbackFailed(t *testing.T) {
t.Parallel()
assert := assert.New(t)
w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)

// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
j.Stable = true
d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups["web"].AutoRevert = true
a := mock.Alloc()
a.DeploymentID = d.ID
assert.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
assert.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
assert.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")

// Upsert the job again to get a new version
j2 := j.Copy()
// Modify the job to make its specification different
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { assert.Equal(1, len(w.watchers), "Should have 1 deployment") })

// Assert that we will get a createEvaluation call only once. This will
// verify that the watcher is batching allocation changes
m1 := matchUpsertEvals([]string{d.ID})
m.On("UpsertEvals", mocker.MatchedBy(m1)).Return(nil).Once()

// Update the allocs health to healthy which should create an evaluation
for i := 0; i < 5; i++ {
req := &structs.ApplyDeploymentAllocHealthRequest{
DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
HealthyAllocationIDs: []string{a.ID},
},
}
assert.Nil(m.state.UpdateDeploymentAllocHealth(m.nextIndex(), req), "UpsertDeploymentAllocHealth")
}

// Wait for there to be one eval
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}

if l := len(evals); l != 1 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}

return true, nil
}, func(err error) {
t.Fatal(err)
})

// Assert that we get a call to UpsertDeploymentStatusUpdate with roll back failed as the status
c := &matchDeploymentStatusUpdateConfig{
DeploymentID: d.ID,
Status: structs.DeploymentStatusFailed,
StatusDescription: structs.DeploymentStatusDescriptionRollbackFailed(structs.DeploymentStatusDescriptionFailedAllocations, 0, 1),
JobVersion: nil,
Eval: true,
}
m2 := matchDeploymentStatusUpdateRequest(c)
m.On("UpdateDeploymentStatus", mocker.MatchedBy(m2)).Return(nil)

// Update the allocs health to unhealthy which will cause attempting a rollback,
// fail in that step, do status update and eval
req2 := &structs.ApplyDeploymentAllocHealthRequest{
DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
},
}
assert.Nil(m.state.UpdateDeploymentAllocHealth(m.nextIndex(), req2), "UpsertDeploymentAllocHealth")

// Wait for there to be one eval
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}

if l := len(evals); l != 2 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}

return true, nil
}, func(err error) {
t.Fatal(err)
})

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need to assert the job version hasn't changed?

m.AssertCalled(t, "UpsertEvals", mocker.MatchedBy(m1))

}

// Test evaluations are batched between watchers
func TestWatcher_BatchEvals(t *testing.T) {
t.Parallel()
Expand Down
6 changes: 6 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -4322,6 +4322,12 @@ func DeploymentStatusDescriptionRollback(baseDescription string, jobVersion uint
return fmt.Sprintf("%s - rolling back to job version %d", baseDescription, jobVersion)
}

// DeploymentStatusDescriptionRollbackFailed is used to get the status description of
// a deployment when rolling back to an older job is not possible because it has the same specification
func DeploymentStatusDescriptionRollbackFailed(baseDescription string, jobVersion uint64, jobVersionOld uint64) string {
return fmt.Sprintf("%s - rolling back to job version %d failed because it had the same specification as job version %d ", baseDescription, jobVersion, jobVersionOld)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

" - not rolling back to stable job version %d as current job has same specification"? If we can get across the same message in less the better. Also want to avoid saying the roll back failed since it didn't, we just won't do it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I am not a fan of my wording, I like your suggestion, will fix

}

// DeploymentStatusDescriptionNoRollbackTarget is used to get the status description of
// a deployment when there is no target to rollback to but autorevet is desired.
func DeploymentStatusDescriptionNoRollbackTarget(baseDescription string) string {
Expand Down