From 02c822477c1c88aa69364b0cd969fe31d28f14e0 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 2 Sep 2021 19:37:13 +0200 Subject: [PATCH 1/4] Add recording rules for speeding up Alertmanager dashboard. With large numbers of tenants the queries for some panels on this dashboard can become quite slow as the metrics exposed are per-tenant. --- .../dashboards/alertmanager.libsonnet | 38 ++++++------ cortex-mixin/recording_rules.libsonnet | 61 +++++++++++++++++++ 2 files changed, 80 insertions(+), 19 deletions(-) diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 7e2e3c5..f735005 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -11,11 +11,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Total Alerts') + - $.statPanel('sum(cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), format='short') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short') ) .addPanel( $.panel('Total Silences') + - $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short') ) .addPanel( $.panel('Tenants') + @@ -29,11 +29,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s}) - - sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))' % 
$.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -46,11 +46,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) + sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) - - sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) + sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -61,13 +61,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; [ ||| ( - sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) by(integration) + sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) by(integration) - - sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration) + sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration) ) > 0 or on () vector(0) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)' % $.jobMatcher('alertmanager'), + 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'), ], ['success - {{ integration }}', 'failed - {{ integration }}'] ) @@ -104,7 +104,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) 
(cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -112,7 +112,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_state_replication_total{%s}:rate5m) - - sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_state_replication_failed_total{%s}:rate5m) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total{%s}:rate5m)' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_partial_state_merges_total{%s}:rate5m) - - sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total{%s}:rate5m) ||| % 
[$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total{%s}:rate5m)' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 1ce2519..990f8c3 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -364,6 +364,67 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) |||, }, + // Aggregations of per-user Alertmanager metrics used in dashboards. + { + record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label, + expr: ||| + sum by (cluster, job, %s) (cortex_alertmanager_alerts) + ||| % $._config.per_instance_label, + }, + { + record: 'cluster_job_%s:cortex_alertmanager_silences:sum' % $._config.per_instance_label, + expr: ||| + sum by (cluster, job, %s) (cortex_alertmanager_silences) + ||| % $._config.per_instance_label, + }, + { + record: 'cluster_job:cortex_alertmanager_alerts_received_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + |||, + }, + { + record: 'cluster_job_integration:cortex_alertmanager_notifications_total:rate5m', + expr: ||| + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + |||, + }, + { + record: 'cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m', + expr: ||| + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_replication_total:rate5m', + expr: ||| + sum by (cluster, 
job) (rate(cortex_alertmanager_state_replication_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_persist_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_persist_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_persist_failed_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_persist_failed_total[5m])) + |||, + }, ], }, ], From 9f8700fbb943a47a7449fe3cf6a410ca59ffd31a Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 7 Sep 2021 16:07:27 +0200 Subject: [PATCH 2/4] Changelog. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 027037d..60e5cc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ * [ENHANCEMENT] Add support for Azure storage in Alertmanager configuration. #381 * [ENHANCEMENT] Add support for running Alertmanager in sharding mode. #394 * [ENHANCEMENT] Allow to customize PromQL engine settings via `queryEngineConfig`. #399 +* [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. #387 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 From ad1a72a8c80099f61bcb554024a8909998b1c27c Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Wed, 15 Sep 2021 14:44:53 +0200 Subject: [PATCH 3/4] Fixes from testing. 
--- cortex-mixin/dashboards/alertmanager.libsonnet | 12 ++++++------ cortex-mixin/recording_rules.libsonnet | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index f735005..8897034 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(cluster_job:cortex_alertmanager_state_replication_total{%s}:rate5m) + sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s}) - - sum(cluster_job:cortex_alertmanager_state_replication_failed_total{%s}:rate5m) + sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total{%s}:rate5m)' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(cluster_job:cortex_alertmanager_partial_state_merges_total{%s}:rate5m) + sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s}) - - sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total{%s}:rate5m) + sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total{%s}:rate5m)' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 
990f8c3..439f44d 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -414,15 +414,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; |||, }, { - record: 'cluster_job:cortex_alertmanager_state_persist_total:rate5m', + record: 'cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m', expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_state_persist_total[5m])) + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) |||, }, { - record: 'cluster_job:cortex_alertmanager_state_persist_failed_total:rate5m', + record: 'cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m', expr: ||| - sum by (cluster, job) (rate(cortex_alertmanager_state_persist_failed_total[5m])) + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) |||, }, ], From 826af4dfbd9bc5ca4b767763ec6f0f677170afcf Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Wed, 15 Sep 2021 14:53:34 +0200 Subject: [PATCH 4/4] Move rules to their own group. --- cortex-mixin/recording_rules.libsonnet | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 439f44d..a438cab 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -364,6 +364,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) |||, }, + ], + }, + { + name: 'cortex_alertmanager_rules', + rules: [ // Aggregations of per-user Alertmanager metrics used in dashboards. { record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label,