From eb52c518d5bfe21f5891cfd2a36ed7dfc1a86468 Mon Sep 17 00:00:00 2001
From: Gerrit <Gerrit91@users.noreply.github.com>
Date: Mon, 16 Oct 2023 14:58:18 +0200
Subject: [PATCH] Add machine issues to alerts and dashboard. (#222)

---
 .../grafana-dashboards/metal-api.yaml         | 80 +++++++++++++++++--
 .../templates/prometheus-stack-values.yaml    | 64 ++++++++++++++-
 2 files changed, 137 insertions(+), 7 deletions(-)

diff --git a/control-plane/roles/monitoring/templates/grafana-dashboards/metal-api.yaml b/control-plane/roles/monitoring/templates/grafana-dashboards/metal-api.yaml
index e0defc59..ff8bd33b 100644
--- a/control-plane/roles/monitoring/templates/grafana-dashboards/metal-api.yaml
+++ b/control-plane/roles/monitoring/templates/grafana-dashboards/metal-api.yaml
@@ -982,13 +982,81 @@ data:
           "title": "Machines",
           "type": "stat"
         },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus"
+          },
+          "description": "Accumulated number of machine issues.",
+          "fieldConfig": {
+            "defaults": {
+              "color": {
+                "mode": "thresholds"
+              },
+              "mappings": [],
+              "thresholds": {
+                "mode": "absolute",
+                "steps": [
+                  {
+                    "color": "green",
+                    "value": null
+                  },
+                  {
+                    "color": "red",
+                    "value": 1
+                  }
+                ]
+              }
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 7,
+            "w": 24,
+            "x": 0,
+            "y": 42
+          },
+          "id": 34,
+          "options": {
+            "displayMode": "gradient",
+            "minVizHeight": 10,
+            "minVizWidth": 0,
+            "orientation": "auto",
+            "reduceOptions": {
+              "calcs": [
+                "lastNotNull"
+              ],
+              "fields": "",
+              "values": false
+            },
+            "showUnfilled": true,
+            "valueMode": "color"
+          },
+          "pluginVersion": "10.0.1",
+          "targets": [
+            {
+              "datasource": {
+                "type": "prometheus",
+                "uid": "prometheus"
+              },
+              "editorMode": "code",
+              "expr": "sum by (issueid) (metal_machine_issues)",
+              "instant": false,
+              "legendFormat": "{{issueid}}",
+              "range": true,
+              "refId": "A"
+            }
+          ],
+          "title": "Machine Issues",
+          "type": "bargauge"
+        },
         {
           "collapsed": false,
           "gridPos": {
             "h": 1,
             "w": 24,
             "x": 0,
-            "y": 42
+            "y": 49
           },
           "id": 24,
           "panels": [],
@@ -1027,7 +1095,7 @@ data:
             "h": 4,
             "w": 12,
             "x": 0,
-            "y": 43
+            "y": 50
           },
           "id": 27,
           "options": {
@@ -1094,7 +1162,7 @@ data:
             "h": 4,
             "w": 12,
             "x": 12,
-            "y": 43
+            "y": 50
           },
           "id": 28,
           "options": {
@@ -1190,7 +1258,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 47
+            "y": 54
           },
           "id": 23,
           "options": {
@@ -1289,7 +1357,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 47
+            "y": 54
           },
           "id": 29,
           "options": {
@@ -1350,7 +1418,7 @@ data:
       "timezone": "",
       "title": "metal-api",
       "uid": "metal-api",
-      "version": 3,
+      "version": 4,
       "weekStart": ""
     }
     {% endraw %}
diff --git a/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml b/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml
index e5ba5fc4..6ac3d854 100644
--- a/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml
+++ b/control-plane/roles/monitoring/templates/prometheus-stack-values.yaml
@@ -279,6 +279,26 @@ kubeControllerManager:
 
 {% raw %}
 additionalPrometheusRulesMap:
+  "alertmanager.rules":
+    groups:
+    - name: alertmanager.rules  # matches the enclosing map key; distinct from the metal-api.rules group
+      rules:
+      - alert: PrometheusErrorSendingAlertsToAlertmanager  # notification delivery from Prometheus to an Alertmanager is failing
+        annotations:
+          description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
+            {{ $labels.instance }} to Alertmanager {{ $labels.alertmanager }}.'
+          summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
+        expr: |
+          (
+            rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
+          /
+            rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
+          )
+          * 100
+          > 1
+        for: 15m  # the error percentage must stay above 1 for 15 minutes before the alert fires
+        labels:
+          severity: critical
   "metal-api.rules":
     groups:
     - name: metal-api.rules
@@ -312,6 +332,48 @@ additionalPrometheusRulesMap:
         annotations:
           summary: "Metal API response status code not OK"
           description: "The metal_api response status code is not in the expected range."
+      - alert: MetalFailedMachineReclaim  # first of six per-issue alerts; all share the same expr shape, differing only in issueid
+        expr: (metal_machine_issues{issueid="failed-machine-reclaim"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
+        for: 5m  # the issue must persist for 5 minutes before the alert fires
+        labels:
+          severity: warning  # NOTE(review): this static label overwrites the severity label joined from metal_machine_issues_info - confirm intended
+        annotations:  # partition comes from the metal_machine_allocation_info join, refurl from the metal_machine_issues_info join
+          description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` did not go back into waiting machine pool after freeing the machine. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})"
+      - alert: MetalMachineHasNoEventContainer  # issueid "no-event-container"
+        expr: (metal_machine_issues{issueid="no-event-container"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` has no event container. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})"
+      - alert: MetalBmcInfoOutdated  # issueid "bmc-info-outdated"
+        expr: (metal_machine_issues{issueid="bmc-info-outdated"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` does not receive BMC updates. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})"
+      - alert: MetalBmcNoDistinctIP  # issueid "bmc-no-distinct-ip"
+        expr: (metal_machine_issues{issueid="bmc-no-distinct-ip"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` has no distinct BMC address. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})"
+      - alert: MetalMachineCrashloop  # issueid "crashloop"
+        expr: (metal_machine_issues{issueid="crashloop"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` is in a provisioning crashloop. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})"
+      - alert: MetalMachineDead  # issueid "liveliness-dead"
+        expr: (metal_machine_issues{issueid="liveliness-dead"} == 1) * on (machineid) group_left(partition) metal_machine_allocation_info * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: "Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}` is dead. Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})."
     - name: metal-api-recording.rules
       rules:
       - record: frr:instance:metal_switch_interface_info
@@ -327,7 +389,7 @@ additionalPrometheusRulesMap:
         labels:
           severity: "warning"
         annotations:
-          description: "Partition {{ $labels.partition }} has {{ $value }} DEAD Machines"
+          description: "Partition {{ $labels.partition }} has {{ $value }} dead machines."
       - alert: MachineCapacityLow
         expr: (avg(metal_partition_capacity_free{size!="unknown"} > 5) by (partition, size) / avg(metal_partition_capacity_total{size!="unknown"} > 5) by (partition, size)) * 100 < 10
         for: 10m