Aggregate the kube-state-metrics alerts per cluster.

Each alert expression now groups by the cluster label, so every alert is
evaluated separately for each value of that label. The mixin takes the label
name from the new `clusterLabel` config field (default 'cluster'); the example
rules use the literal `cluster` label.

diff --git a/examples/prometheus-alerting-rules/alerts.yaml b/examples/prometheus-alerting-rules/alerts.yaml
index ba80354da0..7171ea4ead 100644
--- a/examples/prometheus-alerting-rules/alerts.yaml
+++ b/examples/prometheus-alerting-rules/alerts.yaml
@@ -6,9 +6,9 @@ groups:
       description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
       summary: kube-state-metrics is experiencing errors in list operations.
     expr: |
-      (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
+      (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
         /
-      sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
+      sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
       > 0.01
     for: 15m
     labels:
@@ -18,9 +18,9 @@ groups:
       description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
       summary: kube-state-metrics is experiencing errors in watch operations.
     expr: |
-      (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
+      (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
         /
-      sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
+      sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
       > 0.01
     for: 15m
     labels:
@@ -30,7 +30,7 @@ groups:
       description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
       summary: kube-state-metrics sharding is misconfigured.
     expr: |
-      stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
+      stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
     for: 15m
     labels:
       severity: critical
@@ -39,9 +39,9 @@ groups:
       description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
       summary: kube-state-metrics shards are missing.
     expr: |
-      2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
+      2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
         -
-      sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) )
+      sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
       != 0
     for: 15m
     labels:
diff --git a/jsonnet/kube-state-metrics-mixin/alerts.libsonnet b/jsonnet/kube-state-metrics-mixin/alerts.libsonnet
index 396f910912..e378df18ee 100644
--- a/jsonnet/kube-state-metrics-mixin/alerts.libsonnet
+++ b/jsonnet/kube-state-metrics-mixin/alerts.libsonnet
@@ -7,9 +7,9 @@
           {
             alert: 'KubeStateMetricsListErrors',
             expr: |||
-              (sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s,result="error"}[5m]))
+              (sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s,result="error"}[5m])) by (%(clusterLabel)s)
                 /
-              sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s}[5m])))
+              sum(rate(kube_state_metrics_list_total{%(kubeStateMetricsSelector)s}[5m])) by (%(clusterLabel)s))
               > 0.01
             ||| % $._config,
             'for': '15m',
@@ -24,9 +24,9 @@
           {
             alert: 'KubeStateMetricsWatchErrors',
             expr: |||
-              (sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s,result="error"}[5m]))
+              (sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s,result="error"}[5m])) by (%(clusterLabel)s)
                 /
-              sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s}[5m])))
+              sum(rate(kube_state_metrics_watch_total{%(kubeStateMetricsSelector)s}[5m])) by (%(clusterLabel)s))
               > 0.01
             ||| % $._config,
             'for': '15m',
@@ -42,7 +42,7 @@
             alert: 'KubeStateMetricsShardingMismatch',
             //
             expr: |||
-              stdvar (kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) != 0
+              stdvar (kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) != 0
             ||| % $._config,
             'for': '15m',
             labels: {
@@ -61,9 +61,9 @@
             // A handy side effect of this computation is the result indicates what ordinals are missing.
             // Eg. a result of "5" decimal, which translates to binary "101", means shards #0 and #2 are not available.
             expr: |||
-              2^max(kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) - 1
+              2^max(kube_state_metrics_total_shards{%(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - 1
                 -
-              sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{%(kubeStateMetricsSelector)s}) )
+              sum( 2 ^ max by (%(clusterLabel)s, shard_ordinal) (kube_state_metrics_shard_ordinal{%(kubeStateMetricsSelector)s}) ) by (%(clusterLabel)s)
               != 0
             ||| % $._config,
             'for': '15m',
diff --git a/jsonnet/kube-state-metrics-mixin/config.libsonnet b/jsonnet/kube-state-metrics-mixin/config.libsonnet
index 8a5402c693..f4e66f12f3 100644
--- a/jsonnet/kube-state-metrics-mixin/config.libsonnet
+++ b/jsonnet/kube-state-metrics-mixin/config.libsonnet
@@ -2,5 +2,6 @@
   _config+:: {
     // Select the metrics coming from the kube state metrics.
     kubeStateMetricsSelector: 'job="kube-state-metrics"',
+    clusterLabel: 'cluster',
   },
 }