Skip to content

Commit

Permalink
OCPBUGS-xx: add runbook_url annotations
Browse files (browse the repository at this point in the history)
Signed-off-by: Simon Pasquier <spasquie@redhat.com>
  • Loading branch information
simonpasquier committed Jul 4, 2024
1 parent b769e66 commit fb95a0c
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 1 deletion.
2 changes: 2 additions & 0 deletions assets/prometheus-k8s/prometheus-rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ spec:
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusRemoteStorageFailures.md
summary: Prometheus fails to send samples to remote storage.
expr: |
(
Expand Down Expand Up @@ -220,6 +221,7 @@ spec:
- alert: PrometheusScrapeBodySizeLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusScrapeBodySizeLimitHit.md
summary: Prometheus has dropped some targets that exceeded body size limit.
expr: |
increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0
Expand Down
1 change: 1 addition & 0 deletions assets/prometheus-operator/prometheus-rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ spec:
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/PrometheusOperatorRejectedResources.md
summary: Resources rejected by Prometheus operator
expr: |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator", namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[5m]) > 0
Expand Down
5 changes: 4 additions & 1 deletion jsonnet/utils/sanitize-rules.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -430,9 +430,9 @@ local openShiftRunbookCMO(runbook) =
openShiftRunbook('alerts/cluster-monitoring-operator/' + runbook);

local includeRunbooks = {
AlertmanagerClusterFailedToSendAlerts: openShiftRunbookCMO('AlertmanagerClusterFailedToSendAlerts.md'),
AlertmanagerFailedReload: openShiftRunbookCMO('AlertmanagerFailedReload.md'),
AlertmanagerFailedToSendAlerts: openShiftRunbookCMO('AlertmanagerFailedToSendAlerts.md'),
AlertmanagerClusterFailedToSendAlerts: openShiftRunbookCMO('AlertmanagerClusterFailedToSendAlerts.md'),
ClusterOperatorDegraded: openShiftRunbookCMO('ClusterOperatorDegraded.md'),
ClusterOperatorDown: openShiftRunbookCMO('ClusterOperatorDown.md'),
KubeAPIDown: openShiftRunbookCMO('KubeAPIDown.md'),
Expand All @@ -450,7 +450,10 @@ local includeRunbooks = {
NodeFilesystemSpaceFillingUp: openShiftRunbookCMO('NodeFilesystemSpaceFillingUp.md'),
NodeRAIDDegraded: openShiftRunbookCMO('NodeRAIDDegraded.md'),
NodeClockNotSynchronising: openShiftRunbookCMO('NodeClockNotSynchronising.md'),
PrometheusOperatorRejectedResources: openShiftRunbookCMO('PrometheusOperatorRejectedResources.md'),
PrometheusRuleFailures: openShiftRunbookCMO('PrometheusRuleFailures.md'),
PrometheusRemoteStorageFailures: openShiftRunbookCMO('PrometheusRemoteStorageFailures.md'),
PrometheusScrapeBodySizeLimitHit: openShiftRunbookCMO('PrometheusScrapeBodySizeLimitHit.md'),
PrometheusTargetSyncFailure: openShiftRunbookCMO('PrometheusTargetSyncFailure.md'),
ThanosRuleQueueIsDroppingAlerts: openShiftRunbookCMO('ThanosRuleQueueIsDroppingAlerts.md'),
ThanosRuleRuleEvaluationLatencyHigh: openShiftRunbookCMO('ThanosRuleRuleEvaluationLatencyHigh.md'),
Expand Down

0 comments on commit fb95a0c

Please sign in to comment.