diff --git a/client/allocrunner/taskrunner/script_check_hook.go b/client/allocrunner/taskrunner/script_check_hook.go
index fe2bd06a4d87..74efa24efca9 100644
--- a/client/allocrunner/taskrunner/script_check_hook.go
+++ b/client/allocrunner/taskrunner/script_check_hook.go
@@ -116,6 +116,7 @@ func (h *scriptCheckHook) Update(ctx context.Context, req *interfaces.TaskUpdate
 	if task == nil {
 		return fmt.Errorf("task %q not found in updated alloc", h.task.Name)
 	}
+	h.alloc = req.Alloc
 	h.task = task
 	h.taskEnv = req.TaskEnv
diff --git a/e2e/consul/consul.go b/e2e/consul/consul.go
index 5809e2517742..e0b85c0e9e2f 100644
--- a/e2e/consul/consul.go
+++ b/e2e/consul/consul.go
@@ -23,6 +23,7 @@ func init() {
 		Consul: true,
 		Cases: []framework.TestCase{
 			new(ConsulE2ETest),
+			new(ScriptChecksE2ETest),
 		},
 	})
 }
diff --git a/e2e/consul/input/checks_group.nomad b/e2e/consul/input/checks_group.nomad
new file mode 100644
index 000000000000..aafbf2bbdc1c
--- /dev/null
+++ b/e2e/consul/input/checks_group.nomad
@@ -0,0 +1,81 @@
+job "group_check" {
+  datacenters = ["dc1"]
+  type = "service"
+
+  group "group_check" {
+    network {
+      mode = "bridge"
+    }
+
+    service {
+      name = "group-service-1"
+      port = "9001"
+
+      check {
+        name = "alive-1"
+        type = "script"
+        task = "test"
+        interval = "2s"
+        timeout = "2s"
+        command = "echo"
+        args = ["alive-1"]
+      }
+    }
+
+    service {
+      name = "group-service-2"
+      port = "9002"
+
+      check {
+        name = "alive-2a"
+        type = "script"
+        task = "test"
+        interval = "2s"
+        timeout = "2s"
+        command = "echo"
+        args = ["alive-2a"]
+      }
+
+      # the file expected by this check will not exist when started,
+      # so the check will error-out and be in a warning state until
+      # it's been created
+      check {
+        name = "alive-2b"
+        type = "script"
+        task = "test"
+        interval = "2s"
+        timeout = "2s"
+        command = "cat"
+        args = ["${NOMAD_TASK_DIR}/alive-2b"]
+      }
+    }
+
+    service {
+      name = "group-service-3"
+      port = "9003"
+
+      # this check should always time out and so the service
+      # should not be marked healthy
+      check {
+        name = "always-dead"
+        type = "script"
+        task = "test"
+        interval = "2s"
+        timeout = "1s"
+        command = "sleep"
+        args = ["10"]
+      }
+    }
+
+    count = 1
+
+    task "test" {
+      driver = "raw_exec"
+
+      config {
+        command = "bash"
+        args = ["-c", "sleep 15000"]
+      }
+    }
+  }
+}
diff --git a/e2e/consul/input/checks_group_update.nomad b/e2e/consul/input/checks_group_update.nomad
new file mode 100644
index 000000000000..f25cba7932a2
--- /dev/null
+++ b/e2e/consul/input/checks_group_update.nomad
@@ -0,0 +1,80 @@
+job "group_check" {
+  datacenters = ["dc1"]
+  type = "service"
+
+  group "group_check" {
+    network {
+      mode = "bridge"
+    }
+
+    service {
+      name = "group-service-1"
+      port = "9001"
+
+      # after update, check name has changed
+      check {
+        name = "alive-1a"
+        type = "script"
+        task = "test"
+        interval = "2s"
+        timeout = "2s"
+        command = "echo"
+        args = ["alive-1a"]
+      }
+    }
+
+    service {
+      name = "group-service-2"
+      port = "9002"
+
+      check {
+        name = "alive-2a"
+        type = "script"
+        task = "test"
+        interval = "2s"
+        timeout = "2s"
+        command = "echo"
+        args = ["alive-2a"]
+      }
+
+      # after updating, this check will always pass
+      check {
+        name = "alive-2b"
+        type = "script"
+        task = "test"
+        interval = "2s"
+        timeout = "2s"
+        command = "echo"
+        args = ["alive-2b"]
+      }
+    }
+
+    service {
+      name = "group-service-3"
+      port = "9003"
+
+      # this check should always time out and so the service
+      # should not be marked healthy
+      check {
+        name = "always-dead"
+        type = "script"
+        task = "test"
+        interval = "2s"
+        timeout = "1s"
command = "sleep" + args = ["10"] + } + } + + count = 1 + + task "test" { + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "sleep 15000"] + } + } + } +} diff --git a/e2e/consul/input/checks_task.nomad b/e2e/consul/input/checks_task.nomad new file mode 100644 index 000000000000..3ec288822d34 --- /dev/null +++ b/e2e/consul/input/checks_task.nomad @@ -0,0 +1,70 @@ +job "task_check" { + datacenters = ["dc1"] + type = "service" + + group "task_check" { + count = 1 + + task "test" { + service { + name = "task-service-1" + + check { + name = "alive-1" + type = "script" + interval = "2s" + timeout = "2s" + command = "echo" + args = ["alive-1"] + } + } + + service { + name = "task-service-2" + + check { + name = "alive-2a" + type = "script" + interval = "2s" + timeout = "2s" + command = "echo" + args = ["alive-2a"] + } + + # the file expected by this check will not exist when started, + # so the check will error-out and be in a warning state until + # it's been created + check { + name = "alive-2b" + type = "script" + interval = "2s" + timeout = "2s" + command = "cat" + args = ["${NOMAD_TASK_DIR}/alive-2b"] + } + } + + service { + name = "task-service-3" + + # this check should always time out and so the service + # should not be marked healthy + check { + name = "always-dead" + type = "script" + interval = "2s" + timeout = "1s" + command = "sleep" + args = ["10"] + } + } + + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "sleep 15000"] + } + } + } +} diff --git a/e2e/consul/input/checks_task_update.nomad b/e2e/consul/input/checks_task_update.nomad new file mode 100644 index 000000000000..506db884819b --- /dev/null +++ b/e2e/consul/input/checks_task_update.nomad @@ -0,0 +1,73 @@ +job "task_check" { + datacenters = ["dc1"] + type = "service" + + group "task_check" { + count = 1 + + task "test" { + service { + name = "task-service-1" + + # after update, check name has changed + check { + name = "alive-1a" + type = "script" + task = "test" + interval = "2s" + timeout = "2s" + command = "echo" + args = ["alive-1a"] + } + } + + service { + name = "task-service-2" + + check { + name = "alive-2a" + type = "script" + task = "test" + interval = "2s" + timeout = "2s" + command = "echo" + args = ["alive-2a"] + } + + # after updating, this check will always pass + check { + name = "alive-2b" + type = "script" + task = "test" + interval = "2s" + timeout = "2s" + command = "echo" + args = ["alive-2b"] + } + } + + service { + name = "task-service-3" + + # this check should always time out and so the service + # should not be marked healthy + check { + name = "always-dead" + type = "script" + task = "test" + interval = "2s" + timeout = "1s" + command = "sleep" + args = ["10"] + } + } + + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "sleep 15000"] + } + } + } +} diff --git a/e2e/consul/script_checks.go b/e2e/consul/script_checks.go new file mode 100644 index 000000000000..ed4cb315a996 --- /dev/null +++ b/e2e/consul/script_checks.go @@ -0,0 +1,223 @@ +package consul + +import ( + "bytes" + "context" + "os" + "strings" + "time" + + capi "github.com/hashicorp/consul/api" + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/e2e/framework" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/stretchr/testify/require" +) + +type ScriptChecksE2ETest struct { + framework.TC + jobIds []string +} + +func (tc *ScriptChecksE2ETest) BeforeAll(f *framework.F) { + // Ensure cluster has leader before running 
+	e2eutil.WaitForLeader(f.T(), tc.Nomad())
+	// Ensure that we have at least 1 client node in ready state
+	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 1)
+}
+
+// requireStatus asserts that the aggregate health of the service converges to
+// the expected status
+func requireStatus(require *require.Assertions,
+	consulClient *capi.Client, serviceName, expectedStatus string) {
+	require.Eventually(func() bool {
+		_, status := serviceStatus(require, consulClient, serviceName)
+		return status == expectedStatus
+	}, 30*time.Second, time.Second, // needs a long time for killing tasks/clients
+		"timed out expecting %q to become %q",
+		serviceName, expectedStatus,
+	)
+}
+
+// serviceStatus gets the aggregate health of the service and returns
+// the []*capi.ServiceEntry for further checking
+func serviceStatus(require *require.Assertions,
+	consulClient *capi.Client, serviceName string) ([]*capi.ServiceEntry, string) {
+	services, _, err := consulClient.Health().Service(serviceName, "", false, nil)
+	require.NoError(err, "expected no error for %q, got %v", serviceName, err)
+	if len(services) > 0 {
+		return services, services[0].Checks.AggregatedStatus()
+	}
+	return nil, "(unknown status)"
+}
+
+// requireDeregistered asserts that the service is eventually deregistered from Consul
+func requireDeregistered(require *require.Assertions,
+	consulClient *capi.Client, serviceName string) {
+	require.Eventually(func() bool {
+		services, _, err := consulClient.Health().Service(serviceName, "", false, nil)
+		require.NoError(err, "expected no error for %q, got %v", serviceName, err)
+		return len(services) == 0
+	}, 5*time.Second, time.Second)
+}
+
+// TestGroupScriptCheck runs a job with a single task group with several services
+// and associated script checks. It updates, stops, etc. the job to verify
+// that script checks are re-registered as expected.
+func (tc *ScriptChecksE2ETest) TestGroupScriptCheck(f *framework.F) {
+	nomadClient := tc.Nomad()
+	uuid := uuid.Generate()
+	require := require.New(f.T())
+	consulClient := tc.Consul()
+
+	jobId := "checks_group" + uuid[0:8]
+	tc.jobIds = append(tc.jobIds, jobId)
+
+	// Job run: verify that checks were registered in Consul
+	allocs := e2eutil.RegisterAndWaitForAllocs(f.T(),
+		nomadClient, "consul/input/checks_group.nomad", jobId)
+	require.Equal(1, len(allocs))
+	requireStatus(require, consulClient, "group-service-1", capi.HealthPassing)
+	requireStatus(require, consulClient, "group-service-2", capi.HealthWarning)
+	requireStatus(require, consulClient, "group-service-3", capi.HealthCritical)
+
+	// Check in warning state becomes healthy after check passes
+	_, _, err := exec(nomadClient, allocs,
+		[]string{"/bin/sh", "-c", "touch ${NOMAD_TASK_DIR}/alive-2b"})
+	require.NoError(err)
+	requireStatus(require, consulClient, "group-service-2", capi.HealthPassing)
+
+	// Job update: verify checks are re-registered in Consul
+	allocs = e2eutil.RegisterAndWaitForAllocs(f.T(),
+		nomadClient, "consul/input/checks_group_update.nomad", jobId)
+	require.Equal(1, len(allocs))
+	requireStatus(require, consulClient, "group-service-1", capi.HealthPassing)
+	requireStatus(require, consulClient, "group-service-2", capi.HealthPassing)
+	requireStatus(require, consulClient, "group-service-3", capi.HealthCritical)
+
+	// Verify we don't have any lingering script checks running on the client
+	out, _, err := exec(nomadClient, allocs, []string{"pgrep", "sleep"})
+	require.NoError(err)
+	running := strings.Split(strings.TrimSpace(out.String()), "\n")
+	require.LessOrEqual(len(running), 2) // task itself + 1 check == 2
+
+	// Clean job stop: verify that checks were deregistered in Consul
+	nomadClient.Jobs().Deregister(jobId, false, nil) // nomad job stop
+	requireDeregistered(require, consulClient, "group-service-1")
+	requireDeregistered(require, consulClient, "group-service-2")
+	requireDeregistered(require, consulClient, "group-service-3")
+
+	// Restore for next test
+	allocs = e2eutil.RegisterAndWaitForAllocs(f.T(),
+		nomadClient, "consul/input/checks_group.nomad", jobId)
+	require.Equal(2, len(allocs))
+	requireStatus(require, consulClient, "group-service-1", capi.HealthPassing)
+	requireStatus(require, consulClient, "group-service-2", capi.HealthWarning)
+	requireStatus(require, consulClient, "group-service-3", capi.HealthCritical)
+
+	// Crash a task: verify that the task restarts and checks return to their expected states
+	_, _, err = exec(nomadClient, allocs, []string{"pkill", "sleep"})
+	require.NoError(err)
+	requireStatus(require, consulClient, "group-service-1", capi.HealthPassing)
+	requireStatus(require, consulClient, "group-service-2", capi.HealthWarning)
+	requireStatus(require, consulClient, "group-service-3", capi.HealthCritical)
+
+	// TODO(tgross) ...
+	// Restart client: verify that checks are re-registered
+}
+
+// TestTaskScriptCheck runs a job with a single task with several services
+// and associated script checks. It updates, stops, etc. the job to verify
+// that script checks are re-registered as expected.
+func (tc *ScriptChecksE2ETest) TestTaskScriptCheck(f *framework.F) {
+	nomadClient := tc.Nomad()
+	uuid := uuid.Generate()
+	require := require.New(f.T())
+	consulClient := tc.Consul()
+
+	jobId := "checks_task" + uuid[0:8]
+	tc.jobIds = append(tc.jobIds, jobId)
+
+	// Job run: verify that checks were registered in Consul
+	allocs := e2eutil.RegisterAndWaitForAllocs(f.T(),
+		nomadClient, "consul/input/checks_task.nomad", jobId)
+	require.Equal(1, len(allocs))
+	requireStatus(require, consulClient, "task-service-1", capi.HealthPassing)
+	requireStatus(require, consulClient, "task-service-2", capi.HealthWarning)
+	requireStatus(require, consulClient, "task-service-3", capi.HealthCritical)
+
+	// Check in warning state becomes healthy after check passes
+	_, _, err := exec(nomadClient, allocs,
+		[]string{"/bin/sh", "-c", "touch ${NOMAD_TASK_DIR}/alive-2b"})
+	require.NoError(err)
+	requireStatus(require, consulClient, "task-service-2", capi.HealthPassing)
+
+	// Job update: verify checks are re-registered in Consul
+	allocs = e2eutil.RegisterAndWaitForAllocs(f.T(),
+		nomadClient, "consul/input/checks_task_update.nomad", jobId)
+	require.Equal(1, len(allocs))
+	requireStatus(require, consulClient, "task-service-1", capi.HealthPassing)
+	requireStatus(require, consulClient, "task-service-2", capi.HealthPassing)
+	requireStatus(require, consulClient, "task-service-3", capi.HealthCritical)
+
+	// Verify we don't have any lingering script checks running on the client
+	out, _, err := exec(nomadClient, allocs, []string{"pgrep", "sleep"})
+	require.NoError(err)
+	running := strings.Split(strings.TrimSpace(out.String()), "\n")
+	require.LessOrEqual(len(running), 2) // task itself + 1 check == 2
+
+	// Clean job stop: verify that checks were deregistered in Consul
+	nomadClient.Jobs().Deregister(jobId, false, nil) // nomad job stop
+	requireDeregistered(require, consulClient, "task-service-1")
+	requireDeregistered(require, consulClient, "task-service-2")
+	requireDeregistered(require, consulClient, "task-service-3")
+
+	// Restore for next test
+	allocs = e2eutil.RegisterAndWaitForAllocs(f.T(),
+		nomadClient, "consul/input/checks_task.nomad", jobId)
+	require.Equal(2, len(allocs))
+	requireStatus(require, consulClient, "task-service-1", capi.HealthPassing)
+	requireStatus(require, consulClient, "task-service-2", capi.HealthWarning)
+	requireStatus(require, consulClient, "task-service-3", capi.HealthCritical)
+
+	// Crash a task: verify that the task restarts and checks return to their expected states
+	_, _, err = exec(nomadClient, allocs, []string{"pkill", "sleep"})
+	require.NoError(err)
+	requireStatus(require, consulClient, "task-service-1", capi.HealthPassing)
+	requireStatus(require, consulClient, "task-service-2", capi.HealthWarning)
+	requireStatus(require, consulClient, "task-service-3", capi.HealthCritical)
+
+	// TODO(tgross) ...
+	// Restart client: verify that checks are re-registered
+}
+
+func (tc *ScriptChecksE2ETest) AfterEach(f *framework.F) {
+	nomadClient := tc.Nomad()
+	jobs := nomadClient.Jobs()
+	// Stop all jobs in test
+	for _, id := range tc.jobIds {
+		jobs.Deregister(id, true, nil)
+	}
+	// Garbage collect
+	nomadClient.System().GarbageCollect()
+}
+
+func exec(client *api.Client, allocs []*api.AllocationListStub, command []string) (bytes.Buffer, bytes.Buffer, error) {
+	ctx, cancelFn := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancelFn()
+
+	var stdout, stderr bytes.Buffer
+
+	alloc := &api.Allocation{
+		ID:        allocs[0].ID,
+		Namespace: allocs[0].Namespace,
+		NodeID:    allocs[0].NodeID,
+	}
+	_, err := client.Allocations().Exec(ctx,
+		alloc, "test", false,
+		command,
+		os.Stdin, &stdout, &stderr, // os.Stdout, os.Stderr,
+		make(chan api.TerminalSize), nil)
+	return stdout, stderr, err
+}