From eb52c518d5bfe21f5891cfd2a36ed7dfc1a86468 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Mon, 16 Oct 2023 14:58:18 +0200 Subject: [PATCH] Add machine issues to alerts and dashboard. (#222) --- .../grafana-dashboards/metal-api.yaml | 80 +++++++++++++++++-- .../templates/prometheus-stack-values.yaml | 64 ++++++++++++++- 2 files changed, 137 insertions(+), 7 deletions(-) diff --git a/control-plane/roles/monitoring/templates/grafana-dashboards/metal-api.yaml b/control-plane/roles/monitoring/templates/grafana-dashboards/metal-api.yaml index e0defc59..ff8bd33b 100644 --- a/control-plane/roles/monitoring/templates/grafana-dashboards/metal-api.yaml +++ b/control-plane/roles/monitoring/templates/grafana-dashboards/metal-api.yaml @@ -982,13 +982,81 @@ data: "title": "Machines", "type": "stat" }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Accumulated number of machine issues.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 34, + "options": { + "displayMode": "gradient", + "minVizHeight": 10, + "minVizWidth": 0, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "valueMode": "color" + }, + "pluginVersion": "10.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (issueid) (metal_machine_issues)", + "instant": false, + "legendFormat": "{{issueid}}", + "range": true, + "refId": "A" + } + ], + "title": "Machine Issues", + "type": "bargauge" + }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 42 + "y": 49 }, "id": 24, "panels": [], @@ -1027,7 +1095,7 @@ 
data: "h": 4, "w": 12, "x": 0, - "y": 43 + "y": 50 }, "id": 27, "options": { @@ -1094,7 +1162,7 @@ data: "h": 4, "w": 12, "x": 12, - "y": 43 + "y": 50 }, "id": 28, "options": { @@ -1190,7 +1258,7 @@ data: "h": 8, "w": 12, "x": 0, - "y": 47 + "y": 54 }, "id": 23, "options": { @@ -1289,7 +1357,7 @@ data: "h": 8, "w": 12, "x": 12, - "y": 47 + "y": 54 }, "id": 29, "options": { @@ -1350,7 +1418,7 @@ data: "timezone": "", "title": "metal-api", "uid": "metal-api", - "version": 3, + "version": 4, "weekStart": "" } {% endraw %} diff --git a/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml b/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml index e5ba5fc4..6ac3d854 100644 --- a/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml +++ b/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml @@ -279,6 +279,26 @@ kubeControllerManager: {% raw %} additionalPrometheusRulesMap: + "alertmanager.rules": + groups: + - name: alertmanager.rules + rules: + - alert: PrometheusErrorSendingAlertsToAlertmanager + annotations: + description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus + {{ $labels.instance }} to Alertmanager {{ $labels.alertmanager }}.' + summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. + expr: | + ( + rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: critical "metal-api.rules": groups: - name: metal-api.rules @@ -312,6 +332,48 @@ additionalPrometheusRulesMap: annotations: summary: "Metal API response status code not OK" description: "The metal_api response status code is not in the expected range." 
+ - alert: MetalFailedMachineReclaim + expr: (metal_machine_issues{issueid="failed-machine-reclaim"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info + for: 5m + labels: + severity: warning + annotations: + description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` did not go back into waiting machine pool after freeing the machine. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})" + - alert: MetalMachineHasNoEventContainer + expr: (metal_machine_issues{issueid="no-event-container"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info + for: 5m + labels: + severity: warning + annotations: + description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` has no event container. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})" + - alert: MetalBmcInfoOutdated + expr: (metal_machine_issues{issueid="bmc-info-outdated"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info + for: 5m + labels: + severity: warning + annotations: + description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` does not receive BMC updates. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})" + - alert: MetalBmcNoDistinctIP + expr: (metal_machine_issues{issueid="bmc-no-distinct-ip"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info + for: 5m + labels: + severity: warning + annotations: + description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` has no distinct BMC address. 
Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})" + - alert: MetalMachineCrashloop + expr: (metal_machine_issues{issueid="crashloop"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info + for: 5m + labels: + severity: warning + annotations: + description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` is in a provisioning crashloop. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})" + - alert: MetalMachineDead + expr: (metal_machine_issues{issueid="liveliness-dead"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info + for: 5m + labels: + severity: warning + annotations: + description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` is dead. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})." - name: metal-api-recording.rules rules: - record: frr:instance:metal_switch_interface_info @@ -327,7 +389,7 @@ additionalPrometheusRulesMap: labels: severity: "warning" annotations: - description: "Partition {{ $labels.partition }} has {{ $value }} DEAD Machines" + description: "Partition {{ $labels.partition }} has {{ $value }} dead machines." - alert: MachineCapacityLow expr: (avg(metal_partition_capacity_free{size!="unknown"} > 5) by (partition, size) / avg(metal_partition_capacity_total{size!="unknown"} > 5) by (partition, size)) * 100 < 10 for: 10m