From 038bbb17e3fa49a31fc34ce23b77462d18581995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 2 Jun 2020 11:24:51 +0200 Subject: [PATCH 1/3] Update TSDB metrics after recent change on Cortex master. --- cortex-mixin/alerts/blocks.libsonnet | 4 ++-- cortex-mixin/dashboards/queries.libsonnet | 22 +++++++++++----------- cortex-mixin/dashboards/reads.libsonnet | 12 ++++++------ 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/cortex-mixin/alerts/blocks.libsonnet b/cortex-mixin/alerts/blocks.libsonnet index 822a9bc3..94300db6 100644 --- a/cortex-mixin/alerts/blocks.libsonnet +++ b/cortex-mixin/alerts/blocks.libsonnet @@ -54,9 +54,9 @@ alert: 'CortexStoreGatewayHasNotSyncTheBucket', 'for': '5m', expr: ||| - (time() - cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 60 * 30) + (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway",%s} > 60 * 30) and - cortex_storegateway_blocks_last_successful_sync_timestamp_seconds{%s} > 0 + cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway",%s} > 0 ||| % [$.namespace_matcher(''), $.namespace_matcher('')], labels: { severity: 'critical', diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index 91b039f7..1d44f8a4 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -129,18 +129,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Store-gateway - Blocks') .addPanel( $.panel('Blocks queried / sec') + - $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_blocks_queried_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + { yaxes: $.yaxes('ops') }, ) 
.addPanel( $.panel('Data fetched / sec') + - $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_fetched_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Data touched / sec') + - $.queryPanel('sum by(data_type) (rate(cortex_storegateway_bucket_store_series_data_touched_sum{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + $.stack + { yaxes: $.yaxes('ops') }, ) @@ -150,15 +150,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Series fetch duration (per request)') + - $.latencyPanel('cortex_storegateway_bucket_store_series_get_all_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), + $.latencyPanel('cortex_bucket_store_series_get_all_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( $.panel('Series merge duration (per request)') + - $.latencyPanel('cortex_storegateway_bucket_store_series_merge_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), + $.latencyPanel('cortex_bucket_store_series_merge_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( $.panel('Series returned (per request)') + - $.queryPanel('sum(rate(cortex_storegateway_bucket_store_series_result_series_sum{%s}[$__interval])) / sum(rate(cortex_storegateway_bucket_store_series_result_series_count{%s}[$__interval]))' % 
[$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), + $.queryPanel('sum(rate(cortex_bucket_store_series_result_series_sum{component="store-gateway",%s}[$__interval])) / sum(rate(cortex_bucket_store_series_result_series_count{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), ) ) .addRowIf( @@ -166,20 +166,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Blocks currently loaded') + - $.queryPanel('cortex_storegateway_bucket_store_blocks_loaded{%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{instance}}') + $.queryPanel('cortex_bucket_store_blocks_loaded{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{instance}}') ) .addPanel( $.successFailurePanel( 'Blocks loaded / sec', - 'sum(rate(cortex_storegateway_bucket_store_block_loads_total{%s}[$__interval])) - sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], - 'sum(rate(cortex_storegateway_bucket_store_block_load_failures_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), + 'sum(rate(cortex_bucket_store_block_loads_total{component="store-gateway",%s}[$__interval])) - sum(rate(cortex_bucket_store_block_load_failures_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], + 'sum(rate(cortex_bucket_store_block_load_failures_total{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) .addPanel( $.successFailurePanel( 'Blocks dropped / sec', - 'sum(rate(cortex_storegateway_bucket_store_block_drops_total{%s}[$__interval])) - 
sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], - 'sum(rate(cortex_storegateway_bucket_store_block_drop_failures_total{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), + 'sum(rate(cortex_bucket_store_block_drops_total{component="store-gateway",%s}[$__interval])) - sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], + 'sum(rate(cortex_bucket_store_block_drop_failures_total{component="store-gateway",%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), ) ) ), diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index e39ad6dc..79a8e18d 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -100,17 +100,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Blocks Storage - Index header') .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds_count{%s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Latency (getmulti)') + - $.latencyPanel('cortex_storegateway_blocks_index_cache_memcached_operation_duration_seconds', '{%s,operation="getmulti"}' % $.jobMatcher($._config.job_names.store_gateway)) + $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="store-gateway",name="index-cache"}' % 
$.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_hits_total{%s}[$__interval])) / sum by(item_type) (rate(cortex_storegateway_blocks_index_cache_requests_total{%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') + + $.queryPanel('sum by(item_type) (rate(thanos_store_index_cache_hits_total{component="store-gateway",%s}[$__interval])) / sum by(item_type) (rate(thanos_store_index_cache_requests_total{component="store-gateway",%s}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') + { yaxes: $.yaxes('percentunit') }, ) ) @@ -119,17 +119,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Memcached - Blocks Storage - Chunks') .addPanel( $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(cortex_storegateway_thanos_memcached_operations_total{%s,name="chunks-cache"}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="store-gateway",name="chunks-cache"}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + $.stack + { yaxes: $.yaxes('ops') }, ) .addPanel( $.panel('Latency (getmulti)') + - $.latencyPanel('cortex_storegateway_thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",name="chunks-cache"}' % $.jobMatcher($._config.job_names.store_gateway)) + $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="store-gateway",name="chunks-cache"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( $.panel('Hit ratio') + - $.queryPanel('sum(rate(cortex_storegateway_thanos_cache_memcached_hits_total{%s,name="chunks-cache"}[$__interval])) / 
sum(rate(cortex_storegateway_thanos_cache_memcached_requests_total{%s,name="chunks-cache"}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'chunks') + + $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="store-gateway",name="chunks-cache"}[$__interval])) / sum(rate(thanos_cache_memcached_requests_total{%s,component="store-gateway",name="chunks-cache"}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'chunks') + { yaxes: $.yaxes('percentunit') }, ) ) From 4de30d4901d0928f43468c6062e7762f0cc3df64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 2 Jun 2020 11:28:09 +0200 Subject: [PATCH 2/3] Fix operation name. --- cortex-mixin/dashboards/dashboard-utils.libsonnet | 4 ++-- cortex-mixin/dashboards/object-store.libsonnet | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index 2a9d7a44..800f0b7e 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -175,8 +175,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') }, ) .addPanel( - $.panel('Op: ObjectSize') + - $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="objectsize"}' % [$.namespaceMatcher(), component]), + $.panel('Op: Attributes') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]), ) .addPanel( $.panel('Op: Exists') + diff --git a/cortex-mixin/dashboards/object-store.libsonnet b/cortex-mixin/dashboards/object-store.libsonnet index c0216023..3263446c 100644 --- a/cortex-mixin/dashboards/object-store.libsonnet +++ 
b/cortex-mixin/dashboards/object-store.libsonnet @@ -50,8 +50,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Op: ObjectSize') + - $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="objectsize"}' % $.namespaceMatcher()), + $.panel('Op: Attributes') + + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="attributes"}' % $.namespaceMatcher()), ) .addPanel( $.panel('Op: Upload') + From 916534e9a1404b675e2aaf7572fd831b6162d720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20S=CC=8Ctibrany=CC=81?= Date: Tue, 2 Jun 2020 14:01:43 +0200 Subject: [PATCH 3/3] Added metadata cache stats (per querier and store-gateway). --- .../dashboards/dashboard-utils.libsonnet | 27 +++++++++++++++++++ cortex-mixin/dashboards/reads.libsonnet | 27 +++++++------------ 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index 800f0b7e..291ae3f5 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -202,4 +202,31 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), + + thanosMemcachedCache(title, jobName, component, cacheName):: + super.row(title) + .addPanel( + $.panel('QPS') + + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="%s",name="%s"}[$__interval]))' % [$.jobMatcher(jobName), component, cacheName], '{{operation}}') + + $.stack + + { yaxes: $.yaxes('ops') }, + ) + .addPanel( + $.panel('Latency (getmulti)') + + $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="%s",name="%s"}' % [$.jobMatcher(jobName), component, cacheName]) + ) + 
.addPanel( + $.panel('Hit ratio') + + $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="%s",name="%s"}[$__interval])) / sum(rate(thanos_cache_memcached_requests_total{%s,component="%s",name="%s"}[$__interval]))' % + [ + $.jobMatcher(jobName), + component, + cacheName, + $.jobMatcher(jobName), + component, + cacheName, + ], 'items') + + { yaxes: $.yaxes('percentunit') }, + ), + } diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 79a8e18d..8da8f934 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -97,7 +97,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Memcached - Blocks Storage - Index header') + $.row('Memcached – Blocks Storage – Index header (Store-gateway)') .addPanel( $.panel('QPS') + $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + @@ -116,22 +116,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRowIf( std.setMember('tsdb', $._config.storage_engine), - $.row('Memcached - Blocks Storage - Chunks') - .addPanel( - $.panel('QPS') + - $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="store-gateway",name="chunks-cache"}[$__interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') + - $.stack + - { yaxes: $.yaxes('ops') }, - ) - .addPanel( - $.panel('Latency (getmulti)') + - $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="store-gateway",name="chunks-cache"}' % $.jobMatcher($._config.job_names.store_gateway)) - ) - .addPanel( - $.panel('Hit ratio') + - $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="store-gateway",name="chunks-cache"}[$__interval])) / 
sum(rate(thanos_cache_memcached_requests_total{%s,component="store-gateway",name="chunks-cache"}[$__interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'chunks') + - { yaxes: $.yaxes('percentunit') }, - ) + $.thanosMemcachedCache('Memcached – Blocks Storage – Chunks (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache') + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache') + ) + .addRowIf( + std.setMember('tsdb', $._config.storage_engine), + $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache') ) .addRowIf( std.setMember('chunks', $._config.storage_engine) &&