Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Makes auto reverts robust against previously stable job that fail to start correctly upon reverting #3496

Merged
merged 8 commits into from
Nov 3, 2017
92 changes: 91 additions & 1 deletion nomad/deployment_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ func TestDeploymentEndpoint_Fail_Rollback(t *testing.T) {
// Create the second job, deployment and alloc
j2 := j.Copy()
j2.Stable = false
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"

d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
Expand Down Expand Up @@ -792,7 +794,8 @@ func TestDeploymentEndpoint_SetAllocHealth_Rollback(t *testing.T) {
// Create the second job, deployment and alloc
j2 := j.Copy()
j2.Stable = false

// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
d.JobID = j2.ID
Expand Down Expand Up @@ -857,6 +860,93 @@ func TestDeploymentEndpoint_SetAllocHealth_Rollback(t *testing.T) {
assert.EqualValues(2, jout.Version, "reverted job version")
}

// tests rollback upon alloc health failure to job with identical spec does not succeed
func TestDeploymentEndpoint_SetAllocHealth_NoRollback(t *testing.T) {
t.Parallel()
s1 := testServer(t, func(c *Config) {
c.NumSchedulers = 0 // Prevent automatic dequeue
})
defer s1.Shutdown()
codec := rpcClient(t, s1)
testutil.WaitForLeader(t, s1.RPC)
assert := assert.New(t)
state := s1.fsm.State()

// Create the original job
j := mock.Job()
j.Stable = true
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
assert.Nil(state.UpsertJob(998, j), "UpsertJob")

// Create the second job, deployment and alloc. Job has same spec as original
j2 := j.Copy()
j2.Stable = false

d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
d.JobID = j2.ID
d.JobVersion = j2.Version

a := mock.Alloc()
a.JobID = j.ID
a.DeploymentID = d.ID

assert.Nil(state.UpsertJob(999, j2), "UpsertJob")
assert.Nil(state.UpsertDeployment(1000, d), "UpsertDeployment")
assert.Nil(state.UpsertAllocs(1001, []*structs.Allocation{a}), "UpsertAllocs")

// Set the alloc as unhealthy
req := &structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
WriteRequest: structs.WriteRequest{Region: "global"},
}

// Fetch the response
var resp structs.DeploymentUpdateResponse
assert.Nil(msgpackrpc.CallWithCodec(codec, "Deployment.SetAllocHealth", req, &resp), "RPC")
assert.NotZero(resp.Index, "bad response index")
assert.Nil(resp.RevertedJobVersion, "revert version must be nil")

// Lookup the evaluation
ws := memdb.NewWatchSet()
eval, err := state.EvalByID(ws, resp.EvalID)
assert.Nil(err, "EvalByID failed")
assert.NotNil(eval, "Expect eval")
assert.Equal(eval.CreateIndex, resp.EvalCreateIndex, "eval index mismatch")
assert.Equal(eval.TriggeredBy, structs.EvalTriggerDeploymentWatcher, "eval trigger")
assert.Equal(eval.JobID, d.JobID, "eval job id")
assert.Equal(eval.DeploymentID, d.ID, "eval deployment id")
assert.Equal(eval.Status, structs.EvalStatusPending, "eval status")

// Lookup the deployment
expectedDesc := structs.DeploymentStatusDescriptionRollbackNoop(structs.DeploymentStatusDescriptionFailedAllocations, 0)
dout, err := state.DeploymentByID(ws, d.ID)
assert.Nil(err, "DeploymentByID failed")
assert.Equal(dout.Status, structs.DeploymentStatusFailed, "wrong status")
assert.Equal(dout.StatusDescription, expectedDesc, "wrong status description")
assert.Equal(resp.DeploymentModifyIndex, dout.ModifyIndex, "wrong modify index")
assert.Len(dout.TaskGroups, 1, "should have one group")
assert.Contains(dout.TaskGroups, "web", "should have web group")
assert.Equal(1, dout.TaskGroups["web"].UnhealthyAllocs, "should have one healthy")

// Lookup the allocation
aout, err := state.AllocByID(ws, a.ID)
assert.Nil(err, "AllocByID")
assert.NotNil(aout, "alloc")
assert.NotNil(aout.DeploymentStatus, "alloc deployment status")
assert.NotNil(aout.DeploymentStatus.Healthy, "alloc deployment healthy")
assert.False(*aout.DeploymentStatus.Healthy, "alloc deployment healthy")

// Lookup the job, its version should not have changed
jout, err := state.JobByID(ws, j.Namespace, j.ID)
assert.Nil(err, "JobByID")
assert.NotNil(jout, "job")
assert.EqualValues(1, jout.Version, "original job version")
}

func TestDeploymentEndpoint_List(t *testing.T) {
t.Parallel()
s1 := testServer(t, nil)
Expand Down
21 changes: 18 additions & 3 deletions nomad/deploymentwatcher/deployment_watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ func (w *deploymentWatcher) SetAllocHealth(
}

if j != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
j, desc = w.handleRollbackValidity(j, desc)
}
break
}
Expand Down Expand Up @@ -185,6 +185,21 @@ func (w *deploymentWatcher) SetAllocHealth(
return nil
}

// handleRollbackValidity checks if the job being rolled back to has the same spec as the existing job
// Returns a modified description and job accordingly.
func (w *deploymentWatcher) handleRollbackValidity(rollbackJob *structs.Job, desc string) (*structs.Job, string) {
// Only rollback if job being changed has a different spec.
// This prevents an infinite revert cycle when a previously stable version of the job fails to start up during a rollback
// If the job we are trying to rollback to is identical to the current job, we stop because the rollback will not succeed.
if w.j.SpecChanged(rollbackJob) {
desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
} else {
desc = structs.DeploymentStatusDescriptionRollbackNoop(desc, rollbackJob.Version)
rollbackJob = nil
}
return rollbackJob, desc
}

func (w *deploymentWatcher) PromoteDeployment(
req *structs.DeploymentPromoteRequest,
resp *structs.DeploymentUpdateResponse) error {
Expand Down Expand Up @@ -265,7 +280,7 @@ func (w *deploymentWatcher) FailDeployment(
}

if rollbackJob != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
rollbackJob, desc = w.handleRollbackValidity(rollbackJob, desc)
} else {
desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
}
Expand Down Expand Up @@ -371,7 +386,7 @@ func (w *deploymentWatcher) watch() {
// Description should include that the job is being rolled back to
// version N
if j != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
j, desc = w.handleRollbackValidity(j, desc)
} else {
desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
}
Expand Down
174 changes: 174 additions & 0 deletions nomad/deploymentwatcher/deployments_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,9 @@ func TestWatcher_SetAllocHealth_Unhealthy_Rollback(t *testing.T) {
// Upsert the job again to get a new version
j2 := j.Copy()
j2.Stable = false
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"

assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

w.SetEnabled(true, m.state)
Expand Down Expand Up @@ -316,6 +319,66 @@ func TestWatcher_SetAllocHealth_Unhealthy_Rollback(t *testing.T) {
m.AssertNumberOfCalls(t, "UpdateDeploymentAllocHealth", 1)
}

// Test setting allocation unhealthy on job with identical spec and there should be no rollback
func TestWatcher_SetAllocHealth_Unhealthy_NoRollback(t *testing.T) {
t.Parallel()
assert := assert.New(t)
w, m := defaultTestDeploymentWatcher(t)

// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
j.Stable = true
d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups["web"].AutoRevert = true
a := mock.Alloc()
a.DeploymentID = d.ID
assert.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
assert.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
assert.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")

// Upsert the job again to get a new version
j2 := j.Copy()
j2.Stable = false

assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { assert.Equal(1, len(w.watchers), "Should have 1 deployment") })

// Assert that we get a call to UpsertDeploymentAllocHealth
matchConfig := &matchDeploymentAllocHealthRequestConfig{
DeploymentID: d.ID,
Unhealthy: []string{a.ID},
Eval: true,
DeploymentUpdate: &structs.DeploymentStatusUpdate{
DeploymentID: d.ID,
Status: structs.DeploymentStatusFailed,
StatusDescription: structs.DeploymentStatusDescriptionFailedAllocations,
},
JobVersion: nil,
}
matcher := matchDeploymentAllocHealthRequest(matchConfig)
m.On("UpdateDeploymentAllocHealth", mocker.MatchedBy(matcher)).Return(nil)

// Call SetAllocHealth
req := &structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
}
var resp structs.DeploymentUpdateResponse
err := w.SetAllocHealth(req, &resp)
assert.Nil(err, "SetAllocHealth")

testutil.WaitForResult(func() (bool, error) { return 0 == len(w.watchers), nil },
func(err error) { assert.Equal(0, len(w.watchers), "Should have no deployment") })
m.AssertNumberOfCalls(t, "UpdateDeploymentAllocHealth", 1)
}

// Test promoting a deployment
func TestWatcher_PromoteDeployment_HealthyCanaries(t *testing.T) {
t.Parallel()
Expand Down Expand Up @@ -639,6 +702,8 @@ func TestDeploymentWatcher_Watch(t *testing.T) {

// Upsert the job again to get a new version
j2 := j.Copy()
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

Expand Down Expand Up @@ -734,6 +799,115 @@ func TestDeploymentWatcher_Watch(t *testing.T) {
func(err error) { assert.Equal(0, len(w.watchers), "Should have no deployment") })
}

// Tests that the watcher fails rollback when the spec hasn't changed
func TestDeploymentWatcher_RollbackFailed(t *testing.T) {
t.Parallel()
assert := assert.New(t)
w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)

// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
j.Stable = true
d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups["web"].AutoRevert = true
a := mock.Alloc()
a.DeploymentID = d.ID
assert.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
assert.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
assert.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")

// Upsert the job again to get a new version
j2 := j.Copy()
// Modify the job to make its specification different
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")

w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { assert.Equal(1, len(w.watchers), "Should have 1 deployment") })

// Assert that we will get a createEvaluation call only once. This will
// verify that the watcher is batching allocation changes
m1 := matchUpsertEvals([]string{d.ID})
m.On("UpsertEvals", mocker.MatchedBy(m1)).Return(nil).Once()

// Update the allocs health to healthy which should create an evaluation
for i := 0; i < 5; i++ {
req := &structs.ApplyDeploymentAllocHealthRequest{
DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
HealthyAllocationIDs: []string{a.ID},
},
}
assert.Nil(m.state.UpdateDeploymentAllocHealth(m.nextIndex(), req), "UpsertDeploymentAllocHealth")
}

// Wait for there to be one eval
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}

if l := len(evals); l != 1 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}

return true, nil
}, func(err error) {
t.Fatal(err)
})

// Assert that we get a call to UpsertDeploymentStatusUpdate with roll back failed as the status
c := &matchDeploymentStatusUpdateConfig{
DeploymentID: d.ID,
Status: structs.DeploymentStatusFailed,
StatusDescription: structs.DeploymentStatusDescriptionRollbackNoop(structs.DeploymentStatusDescriptionFailedAllocations, 0),
JobVersion: nil,
Eval: true,
}
m2 := matchDeploymentStatusUpdateRequest(c)
m.On("UpdateDeploymentStatus", mocker.MatchedBy(m2)).Return(nil)

// Update the allocs health to unhealthy which will cause attempting a rollback,
// fail in that step, do status update and eval
req2 := &structs.ApplyDeploymentAllocHealthRequest{
DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
},
}
assert.Nil(m.state.UpdateDeploymentAllocHealth(m.nextIndex(), req2), "UpsertDeploymentAllocHealth")

// Wait for there to be one eval
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}

if l := len(evals); l != 2 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}

return true, nil
}, func(err error) {
t.Fatal(err)
})

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we need to assert the job version hasn't changed?

m.AssertCalled(t, "UpsertEvals", mocker.MatchedBy(m1))

// verify that the job version hasn't changed after upsert
m.state.JobByID(nil, structs.DefaultNamespace, j.ID)
assert.Equal(uint64(0), j.Version, "Expected job version 0 but got ", j.Version)
}

// Test evaluations are batched between watchers
func TestWatcher_BatchEvals(t *testing.T) {
t.Parallel()
Expand Down
6 changes: 6 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -4322,6 +4322,12 @@ func DeploymentStatusDescriptionRollback(baseDescription string, jobVersion uint
return fmt.Sprintf("%s - rolling back to job version %d", baseDescription, jobVersion)
}

// DeploymentStatusDescriptionRollbackNoop is used to get the status description of
// a deployment when rolling back is not possible because it has the same specification
func DeploymentStatusDescriptionRollbackNoop(baseDescription string, jobVersion uint64) string {
return fmt.Sprintf("%s - not rolling back to stable job version %d as current job has same specification", baseDescription, jobVersion)
}

// DeploymentStatusDescriptionNoRollbackTarget is used to get the status description of
// a deployment when there is no target to rollback to but autorevet is desired.
func DeploymentStatusDescriptionNoRollbackTarget(baseDescription string) string {
Expand Down