From c9ec23776bab9a51ee6560bee8f9a0a3b0ef406d Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Wed, 2 Jun 2021 14:48:11 -0400 Subject: [PATCH 1/8] refactor: config for job aggregation strings - to make it easier to override, define "cluster_namespace_job" in $._config as `job_aggregation_prefix`. - added some `job_aggregation_labels_*` as well The resulting output does not change (unless config is overridden). --- cortex-mixin/alerts/alerts.libsonnet | 2 +- cortex-mixin/config.libsonnet | 5 +++++ cortex-mixin/dashboards/writes.libsonnet | 13 ++++++++++--- cortex-mixin/recording_rules.libsonnet | 11 ++++++----- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 7568b4fd..d34777b2 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -39,7 +39,7 @@ { alert: 'CortexRequestLatency', expr: ||| - cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} + %(job_aggregation_prefix)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > %(cortex_p99_latency_threshold_seconds)s ||| % $._config, diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index 364e7a6e..7172ec18 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -38,6 +38,11 @@ compactor: 'compactor.*', // Match also custom compactor deployments. 
}, + // Aggregation strings related to "jobs" + job_aggregation_prefix: 'cluster_namespace_job', + job_aggregation_labels_recording_rules: 'cluster, namespace, job', + job_aggregation_labels_active_series: 'namespace', + // Labels used to in alert aggregations - should uniquely identify // a single Cortex cluster. alert_aggregation_labels: 'cluster, namespace', diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index eb3fcbc3..dc79edf7 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -12,17 +12,24 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Samples / s') + - $.statPanel('sum(cluster_namespace_job:cortex_distributor_received_samples:rate5m{%s})' % $.jobMatcher($._config.job_names.distributor), format='reqps') + $.statPanel( + 'sum(%(jobAggregationPrefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % { + job: $.jobMatcher($._config.job_names.distributor), + jobAggregationPrefix: $._config.job_aggregation_prefix + }, + format='reqps' + ) ) .addPanel( $.panel('Active Series') + $.statPanel(||| sum(cortex_ingester_memory_series{%(ingester)s} - / on(namespace) group_left - max by (namespace) (cortex_distributor_replication_factor{%(distributor)s})) + / on(%(labels)s) group_left + max by (%(labels)s) (cortex_distributor_replication_factor{%(distributor)s})) ||| % { ingester: $.jobMatcher($._config.job_names.ingester), distributor: $.jobMatcher($._config.job_names.distributor), + labels: $._config.job_aggregation_labels_active_series }, format='short') ) .addPanel( diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index f9804744..3cb21191 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -51,10 +51,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; name: 'cortex_received_samples', rules: [ { - record: 
'cluster_namespace_job:cortex_distributor_received_samples:rate5m', + record: '%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m' % $._config, expr: ||| - sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) - |||, + sum by (%(job_aggregation_labels_recording_rules)s) (rate(cortex_distributor_received_samples_total[5m])) + ||| % $._config, }, ], }, @@ -64,6 +64,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; max_samples_per_sec_per_ingester: 80e3, max_samples_per_sec_per_distributor: 240e3, limit_utilisation_target: 0.6, + job_aggregation_prefix: $._config.job_aggregation_prefix, }, name: 'cortex_scaling_rules', rules: [ @@ -89,7 +90,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m + %(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m )[24h:] ) / %(max_samples_per_sec_per_distributor)s @@ -123,7 +124,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m + %(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m )[24h:] ) * 3 / %(max_samples_per_sec_per_ingester)s From d613a91fd060f322c3ecb9157cf3058cbc143b95 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Wed, 2 Jun 2021 14:54:55 -0400 Subject: [PATCH 2/8] lint --- cortex-mixin/dashboards/writes.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index dc79edf7..6debc021 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -13,10 +13,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Samples / s') + $.statPanel( - 
'sum(%(jobAggregationPrefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % { + 'sum(%(jobAggregationPrefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % { job: $.jobMatcher($._config.job_names.distributor), - jobAggregationPrefix: $._config.job_aggregation_prefix - }, + jobAggregationPrefix: $._config.job_aggregation_prefix, + }, format='reqps' ) ) @@ -29,7 +29,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % { ingester: $.jobMatcher($._config.job_names.ingester), distributor: $.jobMatcher($._config.job_names.distributor), - labels: $._config.job_aggregation_labels_active_series + labels: $._config.job_aggregation_labels_active_series, }, format='short') ) .addPanel( From 36ac142d20c01614f47962891042d40c9d96718a Mon Sep 17 00:00:00 2001 From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com> Date: Thu, 3 Jun 2021 10:24:22 -0400 Subject: [PATCH 3/8] Update cortex-mixin/dashboards/writes.libsonnet simplify mapping by extending $._config Co-authored-by: Marco Pracucci --- cortex-mixin/dashboards/writes.libsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index 6debc021..e490db60 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -13,9 +13,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Samples / s') + $.statPanel( - 'sum(%(jobAggregationPrefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % { + 'sum(%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % $._config + { job: $.jobMatcher($._config.job_names.distributor), - jobAggregationPrefix: $._config.job_aggregation_prefix, }, format='reqps' ) From cf893978ef4245bdb2191ff2154fcf6ca58af6ae Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Thu, 3 Jun 2021 18:15:29 -0400 Subject: [PATCH 4/8] fix: syntax --- 
cortex-mixin/dashboards/writes.libsonnet | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index e490db60..dd524e5d 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -13,9 +13,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Samples / s') + $.statPanel( - 'sum(%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % $._config + { - job: $.jobMatcher($._config.job_names.distributor), - }, + 'sum(%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( + $._config { + job: $.jobMatcher($._config.job_names.distributor), + } + ), format='reqps' ) ) From 31466e622c3a5380e4539623e8766dbed4e38b0e Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Fri, 4 Jun 2021 16:52:42 -0400 Subject: [PATCH 5/8] refactor: added a group_config defines group-related strings based off of array-based parameters in _config. deprecated _config.alert_aggregation_labels with a std.trace warning, while maintaining (temporary?) backward compatibility. 
--- cortex-mixin/alerts.libsonnet | 2 +- cortex-mixin/alerts/alerts.libsonnet | 2 +- cortex-mixin/config.libsonnet | 12 +++---- cortex-mixin/dashboards.libsonnet | 2 +- cortex-mixin/dashboards/writes.libsonnet | 9 +++-- cortex-mixin/groups.libsonnet | 45 ++++++++++++++++++++++++ cortex-mixin/mixin.libsonnet | 1 + cortex-mixin/recording_rules.libsonnet | 23 ++++++------ 8 files changed, 68 insertions(+), 28 deletions(-) create mode 100644 cortex-mixin/groups.libsonnet diff --git a/cortex-mixin/alerts.libsonnet b/cortex-mixin/alerts.libsonnet index 9369a7da..771c62c8 100644 --- a/cortex-mixin/alerts.libsonnet +++ b/cortex-mixin/alerts.libsonnet @@ -8,5 +8,5 @@ (import 'alerts/compactor.libsonnet') else {}) + - { _config:: $._config }, + { _config:: $._config + $._group_config }, } diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index d34777b2..3605702d 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -39,7 +39,7 @@ { alert: 'CortexRequestLatency', expr: ||| - %(job_aggregation_prefix)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} + %(group_prefix_jobs)s_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > %(cortex_p99_latency_threshold_seconds)s ||| % $._config, diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index 7172ec18..6322945c 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -38,14 +38,10 @@ compactor: 'compactor.*', // Match also custom compactor deployments. 
}, - // Aggregation strings related to "jobs" - job_aggregation_prefix: 'cluster_namespace_job', - job_aggregation_labels_recording_rules: 'cluster, namespace, job', - job_aggregation_labels_active_series: 'namespace', - - // Labels used to in alert aggregations - should uniquely identify - // a single Cortex cluster. - alert_aggregation_labels: 'cluster, namespace', + // Grouping labels, to uniquely identify and group by {jobs, clusters} + job_labels: ['cluster', 'namespace', 'job'], + cluster_labels: ['cluster', 'namespace'], + cortex_p99_latency_threshold_seconds: 2.5, // Whether resources dashboards are enabled (based on cAdvisor metrics). diff --git a/cortex-mixin/dashboards.libsonnet b/cortex-mixin/dashboards.libsonnet index baf800b3..9e7f71c2 100644 --- a/cortex-mixin/dashboards.libsonnet +++ b/cortex-mixin/dashboards.libsonnet @@ -31,5 +31,5 @@ (import 'dashboards/writes-resources.libsonnet') + (import 'dashboards/alertmanager-resources.libsonnet')) + - { _config:: $._config }, + { _config:: $._config + $._group_config }, } diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index dd524e5d..ea2ce3c3 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -13,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Samples / s') + $.statPanel( - 'sum(%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( + 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { job: $.jobMatcher($._config.job_names.distributor), } @@ -25,12 +25,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Active Series') + $.statPanel(||| sum(cortex_ingester_memory_series{%(ingester)s} - / on(%(labels)s) group_left - max by (%(labels)s) (cortex_distributor_replication_factor{%(distributor)s})) - ||| % { + / on(%(group_by_cluster)s) group_left + max by (%(group_by_cluster)s) 
(cortex_distributor_replication_factor{%(distributor)s})) + ||| % ($._config) { ingester: $.jobMatcher($._config.job_names.ingester), distributor: $.jobMatcher($._config.job_names.distributor), - labels: $._config.job_aggregation_labels_active_series, }, format='short') ) .addPanel( diff --git a/cortex-mixin/groups.libsonnet b/cortex-mixin/groups.libsonnet new file mode 100644 index 00000000..67813645 --- /dev/null +++ b/cortex-mixin/groups.libsonnet @@ -0,0 +1,45 @@ +{ + local makePrefix(groups) = std.join('_', groups), + local makeGroupBy(groups) = std.join(', ', groups), + + local group_by_cluster = makeGroupBy($._config.cluster_labels), + + _group_config+:: { + // Each group prefix is composed of `_`-separated labels + group_prefix_jobs: makePrefix($._config.job_labels), + group_prefix_clusters: makePrefix($._config.cluster_labels), + + // Each group-by label list is `, `-separated and uniquely identifies a job or cluster + group_by_job: makeGroupBy($._config.job_labels), + group_by_cluster: group_by_cluster, + }, + + // The following works around the deprecation of `$._config.alert_aggregation_labels` + // - If an override of that value is detected, a warning will be printed + // - If no override was detected, it will be set to the `group_by_cluster` value, + // which will replace it altogether in the future. + local alert_aggregation_labels_override = ( + { + alert_aggregation_labels: null, + } + super._config + ).alert_aggregation_labels, + + _config+:: { + alert_aggregation_labels: + if alert_aggregation_labels_override != null + then std.trace( + ||| + Deprecated: _config.alert_aggregation_labels + This field has been explicitly overridden to "%s". + Instead, express the override in terms of _config.cluster_labels. + E.g., cluster_labels: %s will automatically convert to "%s".
+ ||| % [ + alert_aggregation_labels_override, + $._config.cluster_labels, + group_by_cluster, + ], + alert_aggregation_labels_override + ) + else std.trace('All good with group by cluster', group_by_cluster), + }, +} diff --git a/cortex-mixin/mixin.libsonnet b/cortex-mixin/mixin.libsonnet index ed281b5b..bc04944c 100644 --- a/cortex-mixin/mixin.libsonnet +++ b/cortex-mixin/mixin.libsonnet @@ -1,4 +1,5 @@ (import 'config.libsonnet') + +(import 'groups.libsonnet') + (import 'dashboards.libsonnet') + (import 'alerts.libsonnet') + (import 'recording_rules.libsonnet') diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 3cb21191..6461ff47 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -1,6 +1,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; { + local _config = { + max_series_per_ingester: 1.5e6, + max_samples_per_sec_per_ingester: 80e3, + max_samples_per_sec_per_distributor: 240e3, + limit_utilisation_target: 0.6, + } + $._config + $._group_config, prometheusRules+:: { groups+: [ { @@ -51,21 +57,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; name: 'cortex_received_samples', rules: [ { - record: '%(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m' % $._config, + record: '%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m' % _config, expr: ||| - sum by (%(job_aggregation_labels_recording_rules)s) (rate(cortex_distributor_received_samples_total[5m])) - ||| % $._config, + sum by (%(group_by_job)s) (rate(cortex_distributor_received_samples_total[5m])) + ||| % _config, }, ], }, { - local _config = { - max_series_per_ingester: 1.5e6, - max_samples_per_sec_per_ingester: 80e3, - max_samples_per_sec_per_distributor: 240e3, - limit_utilisation_target: 0.6, - job_aggregation_prefix: $._config.job_aggregation_prefix, - }, name: 'cortex_scaling_rules', rules: [ { @@ -90,7 +89,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 
ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( - %(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m + %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m )[24h:] ) / %(max_samples_per_sec_per_distributor)s @@ -124,7 +123,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( - %(job_aggregation_prefix)s:cortex_distributor_received_samples:rate5m + %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m )[24h:] ) * 3 / %(max_samples_per_sec_per_ingester)s From a8630f3a5974db395b1d79cad1dd704f04b90a52 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Fri, 4 Jun 2021 16:53:44 -0400 Subject: [PATCH 6/8] refactor: added a group_config defines group-related strings based off of array-based parameters in _config. deprecated _config.alert_aggregation_labels with a std.trace warning, while maintaining (temporary?) backward compatibility. --- cortex-mixin/groups.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-mixin/groups.libsonnet b/cortex-mixin/groups.libsonnet index 67813645..736c0962 100644 --- a/cortex-mixin/groups.libsonnet +++ b/cortex-mixin/groups.libsonnet @@ -40,6 +40,6 @@ ], alert_aggregation_labels_override ) - else std.trace('All good with group by cluster', group_by_cluster), + group_by_cluster, }, } From 6c3aa96ca16e9476f8f768b1c59280a58280e255 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Fri, 4 Jun 2021 16:56:48 -0400 Subject: [PATCH 7/8] refactor: added a group_config defines group-related strings based off of array-based parameters in _config. deprecated _config.alert_aggregation_labels with a std.trace warning, while maintaining (temporary?) backward compatibility. 
--- cortex-mixin/groups.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-mixin/groups.libsonnet b/cortex-mixin/groups.libsonnet index 736c0962..63076672 100644 --- a/cortex-mixin/groups.libsonnet +++ b/cortex-mixin/groups.libsonnet @@ -40,6 +40,6 @@ ], alert_aggregation_labels_override ) - group_by_cluster, + else group_by_cluster, }, } From a03451c0f96723d8df343cb7b5835011c8b3b256 Mon Sep 17 00:00:00 2001 From: Darren Janeczek Date: Tue, 8 Jun 2021 17:40:20 -0400 Subject: [PATCH 8/8] fix: changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc055f18..cb8f383e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ * [CHANGE] Alertmanager: mounted overrides configmap to alertmanager too. #315 * [CHANGE] Memcached: upgraded memcached from `1.5.17` to `1.6.9`. #316 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 +* [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319 +* [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration, and overriding this value is deprecated. Instead, the labels are now defined by the `cluster_labels` list and should be overridden through that list. #319 ## 1.9.0 / 2021-05-18