From eebc52906c8cb3e2661a6b43365a79a16b78adb5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 10:52:44 +0200 Subject: [PATCH 1/2] Add CortexRolloutStuck alert Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 61 +++++++++++++++++++++ jsonnet/mimir-mixin/docs/playbooks.md | 10 ++++ 2 files changed, 71 insertions(+) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 3c407b4813a..74c4ff6cfa4 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -412,6 +412,67 @@ }, ], }, + { + name: 'cortex-rollout-alerts', + rules: [ + { + alert: 'CortexRolloutStuck', + expr: ||| + ( + max without (revision) ( + kube_statefulset_status_current_revision + unless + kube_statefulset_status_update_revision + ) + * + ( + kube_statefulset_replicas + != + kube_statefulset_status_replicas_updated + ) + ) and ( + changes(kube_statefulset_status_replicas_updated[15m]) + == + 0 + ) + * on(%s) group_left max by(%s) (cortex_build_info) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + The {{ $labels.statefulset }} rollout is stuck in %(alert_aggregation_variables)s. + ||| % $._config, + }, + }, + { + alert: 'CortexRolloutStuck', + expr: ||| + ( + kube_deployment_spec_replicas + != + kube_deployment_status_replicas_updated + ) and ( + changes(kube_deployment_status_replicas_updated[15m]) + == + 0 + ) + * on(%s) group_left max by(%s) (cortex_build_info) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + The {{ $labels.deployment }} rollout is stuck in %(alert_aggregation_variables)s. 
+ ||| % $._config, + }, + }, + ], + }, { name: 'cortex-provisioning', rules: [ diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 98416a6d2d1..2393bc94a66 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -231,6 +231,16 @@ How to **investigate**: _If the alert `CortexIngesterTSDBHeadCompactionFailed` fired as well, then give priority to it because that could be the cause._ +### CortexRolloutStuck + +This alert fires when a Cortex service rollout is stuck, which means the number of updated replicas doesn't match the expected one and looks there's no progress in the rollout. The alert monitors services deployed as Kubernetes `StatefulSet` and `Deployment`. + +How to **investigate**: +- Run `kubectl -n get pods -l name=` to get a list of running pods +- Ensure there's no pod in a failing state (eg. `Error`, `OOMKilled`, `CrashLoopBackOff`) +- Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) +- Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information + #### Ingester hit the disk capacity If the ingester hit the disk capacity, any attempt to append samples will fail. 
You should: From ea3274f377f52ca1b220cc757f0406ed0bd9531c Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 11:25:07 +0200 Subject: [PATCH 2/2] Fixed playbook Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/docs/playbooks.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 2393bc94a66..e61f24ff57e 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -231,16 +231,6 @@ How to **investigate**: _If the alert `CortexIngesterTSDBHeadCompactionFailed` fired as well, then give priority to it because that could be the cause._ -### CortexRolloutStuck - -This alert fires when a Cortex service rollout is stuck, which means the number of updated replicas doesn't match the expected one and looks there's no progress in the rollout. The alert monitors services deployed as Kubernetes `StatefulSet` and `Deployment`. - -How to **investigate**: -- Run `kubectl -n get pods -l name=` to get a list of running pods -- Ensure there's no pod in a failing state (eg. `Error`, `OOMKilled`, `CrashLoopBackOff`) -- Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) -- Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information - #### Ingester hit the disk capacity If the ingester hit the disk capacity, any attempt to append samples will fail. You should: @@ -734,6 +724,15 @@ When an alertmanager cannot read the state for a tenant from storage it gets log - The state could not be merged because it might be invalid and could not be decoded. This could indicate data corruption and therefore a bug in the reading or writing of the state, and would need further investigation. - The state could not be read from storage. 
This could be due to a networking issue such as a timeout or an authentication and authorization issue with the remote object store. +### CortexRolloutStuck + +This alert fires when a Cortex service rollout is stuck, which means the number of updated replicas doesn't match the expected one and it looks like there's no progress in the rollout. The alert monitors services deployed as Kubernetes `StatefulSet` and `Deployment`. + +How to **investigate**: +- Run `kubectl -n <namespace> get pods -l name=<name>` to get a list of running pods +- Ensure there's no pod in a failing state (eg. `Error`, `OOMKilled`, `CrashLoopBackOff`) +- Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) +- Run `kubectl -n <namespace> describe statefulset <name>` or `kubectl -n <namespace> describe deployment <name>` and look at "Pod Status" and "Events" to get more information ## Cortex routes by path