From 9c8de7bd958aab40b083519671aee28424e232a8 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 14 Sep 2017 16:44:27 -0700 Subject: [PATCH] Move check_restart to its own section. --- .../job-specification/check_restart.html.md | 151 ++++++++++++++++++ .../docs/job-specification/service.html.md | 69 +------- website/source/layouts/docs.erb | 3 + 3 files changed, 157 insertions(+), 66 deletions(-) create mode 100644 website/source/docs/job-specification/check_restart.html.md diff --git a/website/source/docs/job-specification/check_restart.html.md b/website/source/docs/job-specification/check_restart.html.md new file mode 100644 index 000000000000..344f8e805797 --- /dev/null +++ b/website/source/docs/job-specification/check_restart.html.md @@ -0,0 +1,151 @@ +--- +layout: "docs" +page_title: "check_restart Stanza - Job Specification" +sidebar_current: "docs-job-specification-check_restart" +description: |- + The "check_restart" stanza instructs Nomad when to restart tasks with + unhealthy service checks. +--- + +# `check_restart` Stanza + + + + + + + + + + +
Placement + job -> group -> task -> service -> **check_restart** +
Placement + job -> group -> task -> service -> check -> **check_restart** +
+ +As of Nomad 0.7 the `check_restart` stanza instructs Nomad when to restart +tasks with unhealthy service checks. When a health check in Consul has been +unhealthy for the `limit` specified in a `check_restart` stanza, the task is +restarted according to the task group's [`restart` policy][restart_stanza]. The +`check_restart` settings apply to [`check`s][check_stanza], but may also be +placed on [`service`s][service_stanza] to apply to all checks on a service. + +```hcl +job "mysql" { + group "mysqld" { + + restart { + attempts = 3 + delay = "10s" + interval = "10m" + mode = "fail" + } + + task "server" { + service { + tags = ["leader", "mysql"] + + port = "db" + + check { + type = "tcp" + port = "db" + interval = "10s" + timeout = "2s" + } + + check { + type = "script" + name = "check_table" + command = "/usr/local/bin/check_mysql_table_status" + args = ["--verbose"] + interval = "60s" + timeout = "5s" + + check_restart { + limit = 3 + grace = "90s" + + ignore_warnings = false + } + } + } + } + } +} +``` + +- `limit` `(int: 0)` - Restart task when a health check has failed `limit` + times. For example 1 causes a restart on the first failure. The default, + `0`, disables health check based restarts. Failures must be consecutive. A + single passing check will reset the count, so flapping services may not be + restarted. + +- `grace` `(string: "1s")` - Duration to wait after a task starts or restarts + before checking its health. + +- `ignore_warnings` `(bool: false)` - By default checks with both `critical` + and `warning` statuses are considered unhealthy. Setting `ignore_warnings = + true` treats a `warning` status like `passing` and will not trigger a restart. + +## Example Behavior + +The example `mysql` job above has the following behavior: + +```hcl +check_restart { + # ... + grace = "90s" + # ... +} +``` + +When the `server` task first starts and is registered in Consul, its health +will not be checked for 90 seconds. 
This gives the server time to start up. + +```hcl +check_restart { + limit = 3 + # ... +} +``` + +After the grace period if the script check fails, it has 180 seconds (`60s +interval * 3 limit`) to pass before a restart is triggered. Once a restart is +triggered the task group's [`restart` policy][restart_stanza] takes control: + +```hcl +restart { + # ... + delay = "10s" + # ... +} +``` + +The [`restart` stanza][restart_stanza] controls the restart behavior of the +task. In this case it will wait 10 seconds before restarting. Note that even if +the check passes in this time the restart will still occur. + +Once the task restarts Nomad waits the `grace` period again before starting to +check the task's health. + + +```hcl +restart { + attempts = 3 + # ... + interval = "10m" + mode = "fail" +} +``` + +If the check continues to fail, the task will be restarted up to `attempts` +times within an `interval`. If the `restart` attempts are reached within the +`interval` then the `mode` controls the behavior. In this case the task would fail +and not be restarted again. See the [`restart` stanza][restart_stanza] for +details. + +[check_stanza]: /docs/job-specification/service.html#check-parameters "check stanza" +[restart_stanza]: /docs/job-specification/restart.html "restart stanza" +[service_stanza]: /docs/job-specification/service.html "service stanza" diff --git a/website/source/docs/job-specification/service.html.md b/website/source/docs/job-specification/service.html.md index 2cee91b9d0e5..5d4fc6775853 100644 --- a/website/source/docs/job-specification/service.html.md +++ b/website/source/docs/job-specification/service.html.md @@ -117,6 +117,8 @@ scripts. - `args` `(array: [])` - Specifies additional arguments to the `command`. This only applies to script-based health checks. +- `check_restart` - See [`check_restart` stanza][check_restart_stanza]. + - `command` `(string: )` - Specifies the command to run for performing the health check. 
The script must exit: 0 for passing, 1 for warning, or any other value for a failing health check. This is required for script-based @@ -168,72 +170,6 @@ scripts. - `tls_skip_verify` `(bool: false)` - Skip verifying TLS certificates for HTTPS checks. Requires Consul >= 0.7.2. -#### `check_restart` Stanza - -As of Nomad 0.7 `check` stanzas may include a `check_restart` stanza to restart -tasks with unhealthy checks. Restarts use the parameters from the -[`restart`][restart_stanza] stanza, so if a task group has the default `15s` -delay, tasks won't be restarted for an extra 15 seconds after the -`check_restart` block considers it failed. `check_restart` stanzas have the -follow parameters: - -- `limit` `(int: 0)` - Restart task after `limit` failing health checks. For - example 1 causes a restart on the first failure. The default, `0`, disables - healtcheck based restarts. Failures must be consecutive. A single passing - check will reset the count, so flapping services may not be restarted. - -- `grace` `(string: "1s")` - Duration to wait after a task starts or restarts - before checking its health. On restarts the `delay` and max jitter is added - to the grace period to prevent checking a task's health before it has - restarted. - -- `ignore_warnings` `(bool: false)` - By default checks with both `critical` - and `warning` statuses are considered unhealthy. Setting `ignore_warnings = - true` treats a `warning` status like `passing` and will not trigger a restart. - -For example: - -```hcl -restart { - delay = "8s" -} - -task "mysqld" { - service { - # ... 
- check { - type = "script" - name = "check_table" - command = "/usr/local/bin/check_mysql_table_status" - args = ["--verbose"] - interval = "20s" - timeout = "5s" - - check_restart { - # Restart the task after 3 consecutive failed checks (180s) - limit = 3 - - # Ignore failed checks for 90s after a service starts or restarts - grace = "90s" - - # Treat warnings as unhealthy (the default) - ignore_warnings = false - } - } - } -} -``` - -In this example the `mysqld` task has `90s` from startup to begin passing -healthchecks. After the grace period if `mysqld` would remain unhealthy for -`60s` (as determined by `limit * interval`) it would be restarted after `8s` -(as determined by the `restart.delay`). Nomad would then wait `100s` (as -determined by `grace + delay + (delay * 0.25)`) before checking `mysqld`'s -health again. - -~> `check_restart` stanzas may also be placed in `service` stanzas to apply the - same restart logic to multiple checks. - #### `header` Stanza HTTP checks may include a `header` stanza to set HTTP headers. The `header` @@ -388,6 +324,7 @@ service { [qemu driver][qemu] since the Nomad client does not have access to the file system of a task for that driver. +[check_restart_stanza]: /docs/job-specification/check_restart.html "check_restart stanza" [service-discovery]: /docs/service-discovery/index.html "Nomad Service Discovery" [interpolation]: /docs/runtime/interpolation.html "Nomad Runtime Interpolation" [network]: /docs/job-specification/network.html "Nomad network Job Specification" diff --git a/website/source/layouts/docs.erb b/website/source/layouts/docs.erb index cb98350fdc52..3cbe604f91f3 100644 --- a/website/source/layouts/docs.erb +++ b/website/source/layouts/docs.erb @@ -26,6 +26,9 @@ > artifact + > + check_restart + > constraint