diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e75ffcd69..c75cd0bc4b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#2936](https://github.com/thanos-io/thanos/pull/2936) Compact: Fix ReplicaLabelRemover panic when replicaLabels are not specified. - [#2956](https://github.com/thanos-io/thanos/pull/2956) Store: Fix fetching of chunks bigger than 16000 bytes. - [#2970](https://github.com/thanos-io/thanos/pull/2970) Store: Upgrade minio-go/v7 to fix slowness when running on EKS. +- [#2929](https://github.com/thanos-io/thanos/pull/2929) Mixin: Fix expression for 'unhealthy sidecar' alert and also increase the timeout to 10 minutes. ### Added diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index cb40e2a9f71..386ec60d435 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -275,7 +275,7 @@ rules: message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. expr: | - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job) >= 600 labels: severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 72c3279e490..35586fabeb6 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -258,7 +258,7 @@ groups: message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
expr: | - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job) >= 600 labels: severity: critical - name: thanos-store.rules diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index 25df0414e46..47070457c3b 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -22,47 +22,35 @@ tests: exp_samples: - labels: '{}' value: 120 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) eval_time: 2m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar"}' value: 43 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 42 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 5m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) + eval_time: 10m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar"}' value: 0 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 0 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 6m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) + eval_time: 11m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 0 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar"}' value: 0 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 5m + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) + eval_time: 10m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 300 - - labels: 
'{pod="thanos-sidecar-pod-1"}' - value: 300 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 6m + - labels: '{job="thanos-sidecar"}' + value: 600 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) + eval_time: 11m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 360 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 360 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) >= 300 + - labels: '{job="thanos-sidecar"}' + value: 660 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) >= 600 eval_time: 12m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 720 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar"}' value: 720 alert_rule_test: - eval_time: 1m @@ -71,24 +59,27 @@ tests: alertname: ThanosSidecarUnhealthy - eval_time: 3m alertname: ThanosSidecarUnhealthy - - eval_time: 5m + - eval_time: 10m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' - - eval_time: 6m + message: 'Thanos Sidecar thanos-sidecar is unhealthy for 600 seconds.' + - eval_time: 11m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' + message: 'Thanos Sidecar thanos-sidecar is unhealthy for 660 seconds.' - eval_time: 12m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' + message: 'Thanos Sidecar thanos-sidecar is unhealthy for 720 seconds.' 
diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index c81e2ba0a91..009d8648777 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -27,7 +27,7 @@ message: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.', }, expr: ||| - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job) >= 600 ||| % thanos.sidecar, labels: { severity: 'critical',