Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update deployment health on failed allocations only if health is unset #5645

Merged
merged 3 commits into from
May 3, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion client/allocrunner/alloc_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -553,8 +553,9 @@ func (ar *allocRunner) clientAlloc(taskStates map[string]*structs.TaskState) *st

// If we are part of a deployment and the task has failed, mark the
preetapan marked this conversation as resolved.
Show resolved Hide resolved
// alloc as unhealthy. This guards against the watcher not be started.
// If the health status is already set then terminal allocations should not
if a.ClientStatus == structs.AllocClientStatusFailed &&
alloc.DeploymentID != "" && !a.DeploymentStatus.IsUnhealthy() {
alloc.DeploymentID != "" && !a.DeploymentStatus.HasHealth() && !a.DeploymentStatus.IsUnhealthy() {
preetapan marked this conversation as resolved.
Show resolved Hide resolved
a.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(false),
}
Expand Down
52 changes: 51 additions & 1 deletion client/allocrunner/alloc_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ func TestAllocRunner_HandlesArtifactFailure(t *testing.T) {

// Test that alloc runner kills tasks in task group when another task fails
func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
alloc := mock.BatchAlloc()
alloc := mock.Alloc()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I modified this existing test to change it to use a service job rather than batch so that it will get the DeploymentStatus field. Could use feedback on whether this is the best way to test this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is asserting reasonable things, but I think it would have passed before?

Restoring a Failed alloc with a Healthy DeploymentStatus and asserting ar.AllocState().DeploymentStatus.IsHealthy() would have failed before but worked now I think.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@schmichael is there an existing test you could point me to as an example? I am not able to get this test's DeploymentStatus to become healthy through the mock driver.

tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

Expand All @@ -792,6 +792,22 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
task.Config = map[string]interface{}{
"run_for": "10s",
}
// Set a service with check
task.Services = []*structs.Service{
{
Name: "fakservice",
PortLabel: "http",
Checks: []*structs.ServiceCheck{
{
Name: "fakecheck",
Type: structs.ServiceCheckScript,
Command: "true",
Interval: 30 * time.Second,
Timeout: 5 * time.Second,
},
},
},
}

task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
task2.Name = "task 2"
Expand All @@ -803,8 +819,38 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
alloc.AllocatedResources.Tasks[task.Name] = tr
alloc.AllocatedResources.Tasks[task2.Name] = tr

// Make the alloc be part of a deployment
alloc.DeploymentID = uuid.Generate()
alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
alloc.Job.TaskGroups[0].Update.MaxParallel = 1
alloc.Job.TaskGroups[0].Update.MinHealthyTime = 10 * time.Millisecond
alloc.Job.TaskGroups[0].Update.HealthyDeadline = 2 * time.Second

checkHealthy := &api.AgentCheck{
CheckID: uuid.Generate(),
Status: api.HealthPassing,
}

conf, cleanup := testAllocRunnerConfig(t, alloc)
defer cleanup()

consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
return &consul.AllocRegistration{
Tasks: map[string]*consul.TaskRegistration{
task.Name: {
Services: map[string]*consul.ServiceRegistration{
"123": {
Service: &api.AgentService{Service: "fakeservice"},
Checks: []*api.AgentCheck{checkHealthy},
},
},
},
},
}, nil
}

ar, err := NewAllocRunner(conf)
require.NoError(t, err)
defer destroy(ar)
Expand Down Expand Up @@ -850,6 +896,10 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
return false, fmt.Errorf("task2 should have failed")
}

if !last.DeploymentStatus.HasHealth() {
return false, fmt.Errorf("Expected deployment health to be non nil")
}

return true, nil
}, func(err error) {
require.Fail(t, "err: %v", err)
Expand Down