diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index ef8823f05f..5efd2809b9 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -225,7 +225,7 @@ rules: {{ $value }} seconds for the bucket operations. expr: | ( - histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 15 + histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 ) @@ -336,7 +336,7 @@ rules: }} seconds for instant queries. expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 90 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) > 0 ) @@ -461,7 +461,7 @@ rules: $value }} seconds for the replicate operations. expr: | ( - histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 120 + histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0 ) diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 1d50297f92..bdb7abcb16 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -125,7 +125,7 @@ groups: }} seconds for instant queries. 
expr: | ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 90 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) > 0 ) @@ -277,7 +277,7 @@ groups: {{ $value }} seconds for the bucket operations. expr: | ( - histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 15 + histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0 ) @@ -468,7 +468,7 @@ groups: $value }} seconds for the replicate operations. expr: | ( - histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 120 + histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0 ) diff --git a/mixin/thanos/alerts/bucket_replicate.libsonnet b/mixin/thanos/alerts/bucket_replicate.libsonnet index b46ff18e83..4c1a2f27df 100644 --- a/mixin/thanos/alerts/bucket_replicate.libsonnet +++ b/mixin/thanos/alerts/bucket_replicate.libsonnet @@ -3,6 +3,8 @@ bucket_replicate+:: { jobPrefix: error 'must provide job prefix for Thanos Bucket Replicate dashboard', selector: error 'must provide selector for Thanos Bucket Replicate dashboard', + errorThreshold: 10, /* Percent: compared with the failed replication run ratio times 100; the alert fires at greater-or-equal. */ + p99LatencyThreshold: 20, /* Seconds: bound on the replication run duration quantile. NOTE(review): the expression that consumes this uses histogram_quantile(0.9, ...), i.e. p90 despite the p99 name, and this default replaces the previously hard-coded 120 -- confirm both are intended. */ }, prometheusAlerts+:: { groups+: [ @@ -32,7 +34,7 @@ 
sum(rate(thanos_replicate_replication_runs_total{result="error", %(selector)s}[5m])) / on (namespace) group_left sum(rate(thanos_replicate_replication_runs_total{%(selector)s}[5m])) - ) * 100 >= 10 + ) * 100 >= %(errorThreshold)s ||| % thanos.bucket_replicate, 'for': '5m', labels: { @@ -46,7 +48,7 @@ }, expr: ||| ( - histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > 120 + histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > %(p99LatencyThreshold)s and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0 ) diff --git a/mixin/thanos/alerts/compact.libsonnet b/mixin/thanos/alerts/compact.libsonnet index 04c1115f2d..fd5b82f7d3 100644 --- a/mixin/thanos/alerts/compact.libsonnet +++ b/mixin/thanos/alerts/compact.libsonnet @@ -3,6 +3,8 @@ compact+:: { jobPrefix: error 'must provide job prefix for Thanos Compact alerts', selector: error 'must provide selector for Thanos Compact alerts', + compactionErrorThreshold: 5, /* Percent: compared with the group compaction failure ratio times 100. */ + bucketOpsErrorThreshold: 5, /* Percent: compared with the bucket operation failure ratio times 100. */ }, prometheusAlerts+:: { groups+: [ @@ -41,7 +43,7 @@ sum by (job) (rate(thanos_compact_group_compactions_failures_total{%(selector)s}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{%(selector)s}[5m])) - * 100 > 5 + * 100 > %(compactionErrorThreshold)s ) ||| % thanos.compact, 'for': '15m', @@ -59,7 +61,7 @@ sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) - * 100 > 5 + * 100 > %(bucketOpsErrorThreshold)s ) ||| % thanos.compact, 'for': '15m', diff --git a/mixin/thanos/alerts/query.libsonnet b/mixin/thanos/alerts/query.libsonnet index f563ab14fd..4cfd7ab031 100644 --- a/mixin/thanos/alerts/query.libsonnet +++ b/mixin/thanos/alerts/query.libsonnet @@ -3,6 +3,11 @@ query+:: { jobPrefix: error 'must 
provide job prefix for Thanos Query alerts', selector: error 'must provide selector for Thanos Query alerts', + httpErrorThreshold: 5, /* Percent of 5xx responses; shared by the query and query_range handler alerts. */ + grpcErrorThreshold: 5, /* Percent; shared by the gRPC server and gRPC client error-rate alerts. */ + dnsErrorThreshold: 1, /* Percent: compared with the store API DNS lookup failure ratio times 100. */ + p99QueryLatencyThreshold: 40, /* Seconds: bound on the 0.99 quantile of request duration for the query handler. NOTE(review): this default replaces the previously hard-coded 90 -- confirm the lower value is intended. */ + p99QueryRangeLatencyThreshold: 90, /* Seconds: bound on the 0.99 quantile of request duration for the query_range handler. */ }, prometheusAlerts+:: { groups+: [ @@ -19,7 +24,7 @@ sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="query"}[5m])) / sum(rate(http_requests_total{%(selector)s, handler="query"}[5m])) - ) * 100 > 5 + ) * 100 > %(httpErrorThreshold)s ||| % thanos.query, 'for': '5m', labels: { @@ -36,7 +41,7 @@ sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="query_range"}[5m])) / sum(rate(http_requests_total{%(selector)s, handler="query_range"}[5m])) - ) * 100 > 5 + ) * 100 > %(httpErrorThreshold)s ||| % thanos.query, 'for': '5m', labels: { @@ -53,7 +58,7 @@ sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) / sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) - * 100 > 5 + * 100 > %(grpcErrorThreshold)s ) ||| % thanos.query, 'for': '5m', @@ -71,7 +76,7 @@ sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", %(selector)s}[5m])) / sum by (job) (rate(grpc_client_started_total{%(selector)s}[5m])) - ) * 100 > 5 + ) * 100 > %(grpcErrorThreshold)s ||| % thanos.query, 'for': '5m', labels: { @@ -88,7 +93,7 @@ sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{%(selector)s}[5m])) / sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{%(selector)s}[5m])) - ) * 100 > 1 + ) * 100 > %(dnsErrorThreshold)s ||| % thanos.query, 'for': '15m', labels: { @@ -102,7 +107,7 @@ }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > 90 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > %(p99QueryLatencyThreshold)s and sum 
by (job) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) > 0 ) @@ -119,7 +124,7 @@ }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > 90 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > %(p99QueryRangeLatencyThreshold)s and sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="query_range"}[5m])) > 0 ) diff --git a/mixin/thanos/alerts/receive.libsonnet b/mixin/thanos/alerts/receive.libsonnet index 7a5451ac3d..b9920e6552 100644 --- a/mixin/thanos/alerts/receive.libsonnet +++ b/mixin/thanos/alerts/receive.libsonnet @@ -3,6 +3,10 @@ receive+:: { jobPrefix: error 'must provide job prefix for Thanos Receive alerts', selector: error 'must provide selector for Thanos Receive alerts', + httpErrorThreshold: 5, /* Percent of 5xx responses on the receive handler. */ + forwardErrorThreshold: 5, /* Percent of forward requests that ended with an error result. */ + refreshErrorThreshold: 0, /* Plain ratio of hashring file errors to refreshes; unlike the other error thresholds here, the consuming expression does not multiply by 100. */ + p99LatencyThreshold: 10, /* Seconds: bound on the 0.99 quantile of request duration for the receive handler. */ }, prometheusAlerts+:: { groups+: [ @@ -19,7 +23,7 @@ sum(rate(http_requests_total{code=~"5..", %(selector)s, handler="receive"}[5m])) / sum(rate(http_requests_total{%(selector)s, handler="receive"}[5m])) - ) * 100 > 5 + ) * 100 > %(httpErrorThreshold)s ||| % thanos.receive, 'for': '5m', labels: { @@ -33,7 +37,7 @@ }, expr: ||| ( - histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > 10 + histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > %(p99LatencyThreshold)s and sum by (job) (rate(http_request_duration_seconds_count{%(selector)s, handler="receive"}[5m])) > 0 ) @@ -53,7 +57,7 @@ sum by (job) (rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m])) / sum by (job) (rate(thanos_receive_forward_requests_total{%(selector)s}[5m])) - * 100 > 5 + * 100 > %(forwardErrorThreshold)s ) ||| % 
thanos.receive, 'for': '5m', @@ -71,7 +75,7 @@ sum by (job) (rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m])) - > 0 + > %(refreshErrorThreshold)s ) ||| % thanos.receive, 'for': '15m', diff --git a/mixin/thanos/alerts/rule.libsonnet b/mixin/thanos/alerts/rule.libsonnet index 6d562bc9d4..d9795eefa8 100644 --- a/mixin/thanos/alerts/rule.libsonnet +++ b/mixin/thanos/alerts/rule.libsonnet @@ -3,6 +3,10 @@ rule+:: { jobPrefix: error 'must provide job prefix for Thanos Rule alerts', selector: error 'must provide selector for Thanos Rule alerts', + grpcErrorThreshold: 5, /* Percent of gRPC server requests handled with one of the listed error codes. */ + rulerDnsErrorThreshold: 1, /* Percent: compared with the query API DNS lookup failure ratio times 100. */ + alertManagerDnsErrorThreshold: 1, /* Percent: compared with the Alertmanager DNS lookup failure ratio times 100. */ + evalErrorThreshold: 5, /* Percent: compared with the rule evaluation failure ratio times 100. */ }, prometheusAlerts+:: { groups+: [ @@ -45,7 +49,7 @@ sum by (job) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m])) / sum by (job) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m])) - * 100 > 5 + * 100 > %(evalErrorThreshold)s ) ||| % thanos.rule, @@ -95,7 +99,7 @@ sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) / sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) - * 100 > 5 + * 100 > %(grpcErrorThreshold)s ) ||| % thanos.rule, 'for': '5m', @@ -124,7 +128,7 @@ sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{%(selector)s}[5m])) / sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{%(selector)s}[5m])) - * 100 > 1 + * 100 > %(rulerDnsErrorThreshold)s ) ||| % thanos.rule, 'for': '15m', @@ -142,7 +146,7 @@ sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{%(selector)s}[5m])) / sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{%(selector)s}[5m])) - * 100 > 1 + * 100 > %(alertManagerDnsErrorThreshold)s ) ||| % thanos.rule, 'for': '15m', diff --git a/mixin/thanos/alerts/store.libsonnet b/mixin/thanos/alerts/store.libsonnet 
index 961870496c..a30f71c95d 100644 --- a/mixin/thanos/alerts/store.libsonnet +++ b/mixin/thanos/alerts/store.libsonnet @@ -3,6 +3,11 @@ store+:: { jobPrefix: error 'must provide job prefix for Thanos Store alerts', selector: error 'must provide selector for Thanos Store alerts', + grpcErrorThreshold: 5, /* Percent of gRPC server requests handled with one of the listed error codes. */ + compactionErrorThreshold: 5, /* NOTE(review): not referenced by any store alert hunk visible in this patch -- confirm it is consumed somewhere or drop it. */ + seriesGateErrorThreshold: 2, /* Seconds: despite the Error name, this bounds the 0.9 quantile of the series gate duration. */ + bucketOpsErrorThreshold: 5, /* Percent: compared with the bucket operation failure ratio times 100. */ + bucketOpsP99LatencyThreshold: 2, /* Seconds: bound on the bucket operation duration quantile. NOTE(review): the expression that consumes this uses histogram_quantile(0.9, ...), i.e. p90 despite the P99 name, and this default replaces the previously hard-coded 15 -- confirm both are intended. */ }, prometheusAlerts+:: { groups+: [ @@ -19,7 +24,7 @@ sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m])) / sum by (job) (rate(grpc_server_started_total{%(selector)s}[5m])) - * 100 > 5 + * 100 > %(grpcErrorThreshold)s ) ||| % thanos.store, 'for': '5m', @@ -34,7 +39,7 @@ }, expr: ||| ( - histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > 2 + histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{%(selector)s}[5m]))) > %(seriesGateErrorThreshold)s and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{%(selector)s}[5m])) > 0 ) @@ -54,7 +59,7 @@ sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m])) - * 100 > 5 + * 100 > %(bucketOpsErrorThreshold)s ) ||| % thanos.store, 'for': '15m', @@ -69,7 +74,7 @@ }, expr: ||| ( - histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > 15 + histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{%(selector)s}[5m]))) > %(bucketOpsP99LatencyThreshold)s and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{%(selector)s}[5m])) > 0 )