diff --git a/e2e/consul/check_restart.go b/e2e/consul/check_restart.go
new file mode 100644
index 000000000000..a59eb6c3375e
--- /dev/null
+++ b/e2e/consul/check_restart.go
@@ -0,0 +1,114 @@
+package consul
+
+import (
+	"fmt"
+	"os"
+	"reflect"
+	"regexp"
+	"strings"
+	"time"
+
+	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
+	"github.com/hashicorp/nomad/e2e/framework"
+	"github.com/hashicorp/nomad/helper/uuid"
+)
+
+const ns = ""
+
+type CheckRestartE2ETest struct {
+	framework.TC
+	jobIds []string
+}
+
+func (tc *CheckRestartE2ETest) BeforeAll(f *framework.F) {
+	e2e.WaitForLeader(f.T(), tc.Nomad())
+	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1)
+}
+
+func (tc *CheckRestartE2ETest) AfterEach(f *framework.F) {
+	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
+		return
+	}
+
+	for _, id := range tc.jobIds {
+		_, err := e2e.Command("nomad", "job", "stop", "-purge", id)
+		f.Assert().NoError(err)
+	}
+	tc.jobIds = []string{}
+	_, err := e2e.Command("nomad", "system", "gc")
+	f.Assert().NoError(err)
+}
+
+// TestGroupCheckRestart runs a job with a group service that will never
+// become healthy. Both tasks should be restarted up to the 'restart' limit.
+func (tc *CheckRestartE2ETest) TestGroupCheckRestart(f *framework.F) {
+
+	jobID := "test-group-check-restart-" + uuid.Generate()[0:8]
+	f.NoError(e2e.Register(jobID, "consul/input/checks_group_restart.nomad"))
+	tc.jobIds = append(tc.jobIds, jobID)
+
+	f.NoError(
+		e2e.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
+			func(got []string) bool { return reflect.DeepEqual(got, []string{"failed"}) },
+			&e2e.WaitConfig{Interval: time.Second * 10, Retries: 30},
+		))
+
+	// look up the job's single allocation so we can inspect its status output
+	allocs, err := e2e.AllocsForJob(jobID, ns)
+	f.NoError(err, "could not get allocs for job")
+	f.Len(allocs, 1, "expected exactly one alloc for job")
+	allocID := allocs[0]["ID"]
+
+	expected := "Exceeded allowed attempts 2 in interval 5m0s and mode is \"fail\""
+
+	out, err := e2e.Command("nomad", "alloc", "status", allocID)
+	f.NoError(err, "could not get allocation status")
+	f.Contains(out, expected,
+		fmt.Errorf("expected '%s', got\n%v", expected, out))
+
+	re := regexp.MustCompile(`Total Restarts += (.*)\n`)
+	match := re.FindAllStringSubmatch(out, -1)
+	for _, m := range match {
+		f.Equal("2", strings.TrimSpace(m[1]),
+			fmt.Errorf("expected exactly 2 restarts for both tasks, got:\n%v", out))
+	}
+}
+
+// TestTaskCheckRestart runs a job with a task service that will never become
+// healthy. Only the failed task should be restarted up to the 'restart'
+// limit.
+func (tc *CheckRestartE2ETest) TestTaskCheckRestart(f *framework.F) {
+
+	jobID := "test-task-check-restart-" + uuid.Generate()[0:8]
+	f.NoError(e2e.Register(jobID, "consul/input/checks_task_restart.nomad"))
+	tc.jobIds = append(tc.jobIds, jobID)
+
+	f.NoError(
+		e2e.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
+			func(got []string) bool { return reflect.DeepEqual(got, []string{"failed"}) },
+			&e2e.WaitConfig{Interval: time.Second * 10, Retries: 30},
+		))
+
+	// look up the job's single allocation so we can inspect its status output
+	allocs, err := e2e.AllocsForJob(jobID, ns)
+	f.NoError(err, "could not get allocs for job")
+	f.Len(allocs, 1, "expected exactly one alloc for job")
+	allocID := allocs[0]["ID"]
+
+	expected := "Exceeded allowed attempts 2 in interval 5m0s and mode is \"fail\""
+
+	out, err := e2e.Command("nomad", "alloc", "status", allocID)
+	f.NoError(err, "could not get allocation status")
+	f.Contains(out, expected,
+		fmt.Errorf("expected '%s', got\n%v", expected, out))
+
+	re := regexp.MustCompile(`Total Restarts += (.*)\n`)
+	match := re.FindAllStringSubmatch(out, -1)
+	f.Equal("2", strings.TrimSpace(match[0][1]),
+		fmt.Errorf("expected exactly 2 restarts for failed task, got:\n%v", out))
+
+	f.Equal("0", strings.TrimSpace(match[1][1]),
+		fmt.Errorf("expected exactly no restarts for healthy task, got:\n%v", out))
+}
diff --git a/e2e/consul/consul.go b/e2e/consul/consul.go
index 109db29cf817..20212480ffa1 100644
--- a/e2e/consul/consul.go
+++ b/e2e/consul/consul.go
@@ -35,6 +35,7 @@ func init() {
 		Cases: []framework.TestCase{
 			new(ConsulE2ETest),
 			new(ScriptChecksE2ETest),
+			new(CheckRestartE2ETest),
 		},
 	})
 }
diff --git a/e2e/consul/input/checks_group_restart.nomad b/e2e/consul/input/checks_group_restart.nomad
new file mode 100644
index 000000000000..0f520cbbc1ab
--- /dev/null
+++ b/e2e/consul/input/checks_group_restart.nomad
@@ -0,0 +1,64 @@
+job "group_check_restart" {
+  datacenters = ["dc1"]
+  type        = "service"
+
+  constraint {
+    attribute = "${attr.kernel.name}"
+    value     = "linux"
+  }
+
+  group "group_check_restart" {
+    network {
+      mode = "bridge"
+    }
+
+    restart {
+      attempts = 2
+      delay    = "1s"
+      interval = "5m"
+      mode     = "fail"
+    }
+
+    service {
+      name = "group-service-1"
+      port = "9003"
+
+      # this check should always time out and so the service
+      # should not be marked healthy, resulting in the tasks
+      # getting restarted
+      check {
+        name     = "always-dead"
+        type     = "script"
+        task     = "fail"
+        interval = "2s"
+        timeout  = "1s"
+        command  = "sleep"
+        args     = ["10"]
+
+        check_restart {
+          limit           = 2
+          grace           = "5s"
+          ignore_warnings = false
+        }
+      }
+    }
+
+    task "fail" {
+      driver = "raw_exec"
+
+      config {
+        command = "bash"
+        args    = ["-c", "sleep 15000"]
+      }
+    }
+
+    task "ok" {
+      driver = "raw_exec"
+
+      config {
+        command = "bash"
+        args    = ["-c", "sleep 15000"]
+      }
+    }
+  }
+}
diff --git a/e2e/consul/input/checks_task_restart.nomad b/e2e/consul/input/checks_task_restart.nomad
new file mode 100644
index 000000000000..f36ed77b7379
--- /dev/null
+++ b/e2e/consul/input/checks_task_restart.nomad
@@ -0,0 +1,63 @@
+job "task_check" {
+  datacenters = ["dc1"]
+  type        = "service"
+
+  constraint {
+    attribute = "${attr.kernel.name}"
+    value     = "linux"
+  }
+
+  group "task_check" {
+    count = 1
+
+    restart {
+      attempts = 2
+      delay    = "1s"
+      interval = "5m"
+      mode     = "fail"
+    }
+
+    task "fail" {
+
+      service {
+        name = "task-service-1"
+
+        # this check should always time out and so the service
+        # should not be marked healthy
+        check {
+          name     = "always-dead"
+          type     = "script"
+          interval = "2s"
+          timeout  = "1s"
+          command  = "sleep"
+          args     = ["10"]
+
+          check_restart {
+            limit           = 2
+            grace           = "5s"
+            ignore_warnings = false
+          }
+
+        }
+      }
+
+      driver = "raw_exec"
+
+      config {
"bash" + args = ["-c", "sleep 15000"] + } + } + + + task "ok" { + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "sleep 15000"] + } + } + + } +}