From 1ba7a303566610363c0c36c87e7bc6bb492dfc93 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Fri, 19 Apr 2024 21:58:44 +0200 Subject: [PATCH] fix: mixin generation when cluster label is changed (#12613) Signed-off-by: QuentinBisson --- production/loki-mixin/alerts.libsonnet | 8 ++-- .../loki-canary-dashboard.libsonnet | 48 +++++++++---------- .../loki-mixin/dashboards/loki-logs.libsonnet | 6 +-- .../dashboards/loki-operational.libsonnet | 26 +++++++--- .../dashboards/loki-reads.libsonnet | 3 +- .../dashboards/loki-writes.libsonnet | 3 +- .../dashboards/recording-rules.libsonnet | 2 +- 7 files changed, 54 insertions(+), 42 deletions(-) diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet index 089ed7439da3a..5bff18e72c6e5 100644 --- a/production/loki-mixin/alerts.libsonnet +++ b/production/loki-mixin/alerts.libsonnet @@ -57,17 +57,17 @@ { alert: 'LokiTooManyCompactorsRunning', expr: ||| - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - |||, + sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1 + ||| % $._config.per_cluster_label, 'for': '5m', labels: { severity: 'warning', }, annotations: { summary: 'Loki deployment is running more than one compactor.', - description: ||| + description: std.strReplace(||| {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - |||, + |||, 'cluster', $._config.per_cluster_label), }, }, ], diff --git a/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet b/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet index 6539a34d77e4c..94e07deb236bb 100644 --- a/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet +++ b/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet @@ -24,8 +24,8 @@ local grafana = import 'grafonnet/grafana.libsonnet'; // This logic is inherited from mimir-mixin. dashboard.dashboard('Canary') // We can't make use of simplified template selectors from the loki dashboard utils until we port the cortex dashboard utils panel/grid functionality. - .addTemplate('cluster', 'loki_build_info', 'cluster') - .addTemplate('namespace', 'loki_build_info{cluster=~"$cluster"}', 'namespace') + .addTemplate('cluster', 'loki_build_info', $._config.per_cluster_label) + .addTemplate('namespace', 'loki_build_info{' + $._config.per_cluster_label + '=~"$cluster"}', 'namespace') + { // This dashboard uses the new grid system in order to place panels (using gridPos). // Because of this we can't use the mixin's addRow() and addPanel(). @@ -33,7 +33,7 @@ local grafana = import 'grafonnet/grafana.libsonnet'; rows: null, // ugly hack, copy pasta the tag/link // code from the loki-mixin - tags: ['loki'], + tags: $._config.tags, links: [ { asDropdown: true, @@ -49,60 +49,60 @@ local grafana = import 'grafonnet/grafana.libsonnet'; panels: [ // grid row 1 dashboard.panel('Canary Entries Total') + - dashboard.newStatPanel('sum(count(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}))', unit='short') + + dashboard.newStatPanel('sum(count(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster", namespace=~"$namespace"}))', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 0 } }, dashboard.panel('Canary Logs Total') + - dashboard.newStatPanel('sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 3, y: 0 } }, dashboard.panel('Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 6, y: 0 } }, dashboard.panel('Spotcheck Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 9, y: 0 } }, // grid row 2 dashboard.panel('Spotcheck Total') + - dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 4 } }, dashboard.panel('Metric Test Error %') + - dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))) * 100') + + dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))) * 100') + { gridPos: { h: 4, w: 3, x: 3, y: 4 } }, dashboard.panel('Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + + dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + { gridPos: { h: 4, w: 3, x: 6, y: 4 } }, dashboard.panel('Spotcheck Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') + + dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') + { gridPos: { h: 4, w: 3, x: 9, y: 4 } }, // grid row 3 dashboard.panel('Metric Test Expected') + - dashboard.newStatPanel('sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') + + dashboard.newStatPanel('sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 8 } }, dashboard.panel('Metric Test Actual') + - dashboard.newStatPanel('sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') + + dashboard.newStatPanel('sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') + { gridPos: { h: 4, w: 3, x: 3, y: 8 } }, dashboard.panel('Websocket Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 6, y: 8 } }, dashboard.panel('Websocket Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + + dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + { gridPos: { h: 4, w: 3, x: 9, y: 8 } }, // end of grid dashboard.panel('Log Write to read Latency Percentiles') + dashboard.queryPanel([ - 'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', ], ['p95', 'p50']) + { gridPos: { h: 6, w: 12, x: 12, y: 0 } }, @@ -115,7 +115,7 @@ local grafana = import 'grafonnet/grafana.libsonnet'; ).addTargets( [ grafana.prometheus.target( - 'sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)', + 'sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)', legendFormat='{{le}}', format='heatmap', ), @@ -125,24 +125,24 @@ local grafana = import 'grafonnet/grafana.libsonnet'; dashboard.panel('Spot Check Query') + dashboard.queryPanel([ - 'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', ], ['p99', 'p95']) + { gridPos: { h: 6, w: 12, x: 0, y: 14 } }, dashboard.panel('Metric Test Query') + dashboard.queryPanel([ - 'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', + 'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', ], ['p99', 'p95'],) + { gridPos: { h: 6, w: 12, x: 12, y: 14 } }, dashboard.panel('Spot Check Missing %') + - dashboard.queryPanel('topk(20, (sum by (cluster, pod) (increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod) (increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') + + dashboard.queryPanel('topk(20, (sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') + { gridPos: { h: 6, w: 12, x: 0, y: 20 } }, g.panel('Missing logs') + - g.queryPanel('topk(20,(sum by (cluster, pod)(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod)(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ cluster }} {{ pod }}') + + g.queryPanel('topk(20,(sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ ' + $._config.per_cluster_label + ' }} {{ pod }}') + { gridPos: { h: 6, w: 12, x: 12, y: 20 } }, ], diff --git a/production/loki-mixin/dashboards/loki-logs.libsonnet b/production/loki-mixin/dashboards/loki-logs.libsonnet index 9fd6eee589502..b28d74e943664 100644 --- a/production/loki-mixin/dashboards/loki-logs.libsonnet +++ b/production/loki-mixin/dashboards/loki-logs.libsonnet @@ -48,7 +48,6 @@ local template = import 'grafonnet/template.libsonnet'; local cfg = self, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, } + lokiLogs + $.dashboard('Loki / Logs', uid='logs') @@ -61,8 +60,9 @@ local template = import 'grafonnet/template.libsonnet'; p { targets: [ e { - expr: if dashboards['loki-logs.json'].showMultiCluster then super.expr - else std.strReplace(super.expr, $._config.per_cluster_label + '="$cluster", ', ''), + expr: if dashboards['loki-logs.json'].showMultiCluster + then std.strReplace(super.expr, 'cluster="$cluster"', $._config.per_cluster_label + '="$cluster"') + else std.strReplace(super.expr, 'cluster="$cluster", ', ''), } for e in p.targets ], diff --git a/production/loki-mixin/dashboards/loki-operational.libsonnet b/production/loki-mixin/dashboards/loki-operational.libsonnet index e8f5d98248746..e20d7dc2d5629 100644 --- a/production/loki-mixin/dashboards/loki-operational.libsonnet +++ b/production/loki-mixin/dashboards/loki-operational.libsonnet @@ -11,7 +11,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; showAnnotations:: true, showLinks:: true, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, hiddenRows:: [ 'Cassandra', @@ -62,7 +61,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; local replaceClusterMatchers(expr) = if dashboards['loki-operational.json'].showMultiCluster - then expr + // Replace the recording rules cluster label with the per-cluster label + then std.strReplace( + // Replace the cluster label for equality matchers with the per-cluster label + std.strReplace( + // Replace the cluster label for regex matchers with the per-cluster label + std.strReplace( + expr, + 'cluster=~"$cluster"', + $._config.per_cluster_label + '=~"$cluster"' + ), + 'cluster="$cluster"', + $._config.per_cluster_label + '="$cluster"' + ), + 'cluster_', + $._config.per_cluster_label + '_' + ) else std.strReplace( std.strReplace( @@ -143,7 +157,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; local replaceAllMatchers(expr) = - replaceMatchers(replaceClusterMatchers(expr)), + replaceMatchers(expr), local selectDatasource(ds) = if ds == null || ds == '' then ds @@ -179,7 +193,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; datasource: selectDatasource(super.datasource), targets: if std.objectHas(p, 'targets') then [ e { - expr: removeInternalComponents(p.title, e.expr), + expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)), } for e in p.targets ] else [], @@ -188,7 +202,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; datasource: selectDatasource(super.datasource), targets: if std.objectHas(sp, 'targets') then [ e { - expr: removeInternalComponents(p.title, e.expr), + expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)), } for e in sp.targets ] else [], @@ -197,7 +211,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; datasource: selectDatasource(super.datasource), targets: if std.objectHas(ssp, 'targets') then [ e { - expr: removeInternalComponents(p.title, e.expr), + expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)), } for e in ssp.targets ] else [], diff --git a/production/loki-mixin/dashboards/loki-reads.libsonnet b/production/loki-mixin/dashboards/loki-reads.libsonnet index 4dea689b18901..9b82ee0820e49 100644 --- a/production/loki-mixin/dashboards/loki-reads.libsonnet +++ b/production/loki-mixin/dashboards/loki-reads.libsonnet @@ -31,10 +31,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; local cfg = self, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, clusterMatchers:: if cfg.showMultiCluster then - [utils.selector.re(cfg.clusterLabel, '$cluster')] + [utils.selector.re($._config.per_cluster_label, '$cluster')] else [], diff --git a/production/loki-mixin/dashboards/loki-writes.libsonnet b/production/loki-mixin/dashboards/loki-writes.libsonnet index bedb9ca108256..8227cc3834929 100644 --- a/production/loki-mixin/dashboards/loki-writes.libsonnet +++ b/production/loki-mixin/dashboards/loki-writes.libsonnet @@ -9,10 +9,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; local cfg = self, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, clusterMatchers:: if cfg.showMultiCluster then - [utils.selector.re(cfg.clusterLabel, '$cluster')] + [utils.selector.re($._config.per_cluster_label, '$cluster')] else [], diff --git a/production/loki-mixin/dashboards/recording-rules.libsonnet b/production/loki-mixin/dashboards/recording-rules.libsonnet index 2d943807c6485..46618da952dcb 100644 --- a/production/loki-mixin/dashboards/recording-rules.libsonnet +++ b/production/loki-mixin/dashboards/recording-rules.libsonnet @@ -7,7 +7,7 @@ local template = import 'grafonnet/template.libsonnet'; template.new( 'tenant', '$datasource', - 'query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{cluster="$cluster",namespace="$namespace"},"id","$1","tenant","(.*)")) by(id))', + 'query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{' + $._config.per_cluster_label + '="$cluster",namespace="$namespace"},"id","$1","tenant","(.*)")) by(id))', regex='/"([^"]+)"/', sort=1, includeAll=true,