Skip to content

Commit

Permalink
Add machine issues to alerts and dashboard. (#222)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gerrit91 authored Oct 16, 2023
1 parent 2f26823 commit eb52c51
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -982,13 +982,81 @@ data:
"title": "Machines",
"type": "stat"
},
{
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "description": "Accumulated number of machine issues.",
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      }
    },
    "overrides": []
  },
  "gridPos": { "h": 7, "w": 24, "x": 0, "y": 42 },
  "id": 34,
  "options": {
    "displayMode": "gradient",
    "minVizHeight": 10,
    "minVizWidth": 0,
    "orientation": "auto",
    "reduceOptions": {
      "calcs": ["lastNotNull"],
      "fields": "",
      "values": false
    },
    "showUnfilled": true,
    "valueMode": "color"
  },
  "pluginVersion": "10.0.1",
  "targets": [
    {
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "editorMode": "code",
      "expr": "sum by (issueid) (metal_machine_issues)",
      "instant": false,
      "legendFormat": "{{issueid}}",
      "range": true,
      "refId": "A"
    }
  ],
  "title": "Machine Issues",
  "type": "bargauge"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 42
"y": 49
},
"id": 24,
"panels": [],
Expand Down Expand Up @@ -1027,7 +1095,7 @@ data:
"h": 4,
"w": 12,
"x": 0,
"y": 43
"y": 50
},
"id": 27,
"options": {
Expand Down Expand Up @@ -1094,7 +1162,7 @@ data:
"h": 4,
"w": 12,
"x": 12,
"y": 43
"y": 50
},
"id": 28,
"options": {
Expand Down Expand Up @@ -1190,7 +1258,7 @@ data:
"h": 8,
"w": 12,
"x": 0,
"y": 47
"y": 54
},
"id": 23,
"options": {
Expand Down Expand Up @@ -1289,7 +1357,7 @@ data:
"h": 8,
"w": 12,
"x": 12,
"y": 47
"y": 54
},
"id": 29,
"options": {
Expand Down Expand Up @@ -1350,7 +1418,7 @@ data:
"timezone": "",
"title": "metal-api",
"uid": "metal-api",
"version": 3,
"version": 4,
"weekStart": ""
}
{% endraw %}
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,26 @@ kubeControllerManager:

{% raw %}
additionalPrometheusRulesMap:
"alertmanager.rules":
# Rule group for the "alertmanager.rules" entry: watches Prometheus' own
# notification delivery to Alertmanager.
groups:
  # The group is named after its enclosing map key. It previously reused the
  # name "metal-api.rules", which is also the name of the group under the
  # "metal-api.rules" key, so the two groups could not be told apart by name.
  - name: alertmanager.rules
    rules:
      # Fires when the ratio of notification errors to sent notifications
      # (5m rates, as a percentage) stays above 1 for 15 minutes.
      - alert: PrometheusErrorSendingAlertsToAlertmanager
        annotations:
          description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
            {{ $labels.instance }} to Alertmanager {{ $labels.alertmanager }}.'
          summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
        expr: |
          (
          rate(prometheus_notifications_errors_total{job="prometheus"}[5m])
          /
          rate(prometheus_notifications_sent_total{job="prometheus"}[5m])
          )
          * 100
          > 1
        for: 15m
        labels:
          severity: critical
"metal-api.rules":
groups:
- name: metal-api.rules
Expand Down Expand Up @@ -312,6 +332,48 @@ additionalPrometheusRulesMap:
annotations:
summary: "Metal API response status code not OK"
description: "The metal_api response status code is not in the expected range."
# Fires after metal_machine_issues{issueid="failed-machine-reclaim"} has been 1
# for five minutes.
- alert: MetalFailedMachineReclaim
  # Two joins enrich the issue series: `partition` is copied from
  # metal_machine_allocation_info (matched on machineid); `description`,
  # `severity` and `refurl` are copied from metal_machine_issues_info
  # (matched on issueid). The folded scalar yields a single-line expression.
  expr: >-
    (metal_machine_issues{issueid="failed-machine-reclaim"} == 1)
    * on (machineid) group_left(partition) metal_machine_allocation_info
    * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
  for: 5m
  labels:
    # NOTE(review): this static label presumably takes precedence over the
    # `severity` joined in by the expression; confirm that is intended.
    severity: warning
  annotations:
    description: >-
      Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}`
      did not go back into waiting machine pool after freeing the machine.
      Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})
# Fires after metal_machine_issues{issueid="no-event-container"} has been 1
# for five minutes.
- alert: MetalMachineHasNoEventContainer
  # Two joins enrich the issue series: `partition` is copied from
  # metal_machine_allocation_info (matched on machineid); `description`,
  # `severity` and `refurl` are copied from metal_machine_issues_info
  # (matched on issueid). The folded scalar yields a single-line expression.
  expr: >-
    (metal_machine_issues{issueid="no-event-container"} == 1)
    * on (machineid) group_left(partition) metal_machine_allocation_info
    * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
  for: 5m
  labels:
    # NOTE(review): this static label presumably takes precedence over the
    # `severity` joined in by the expression; confirm that is intended.
    severity: warning
  annotations:
    description: >-
      Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}`
      has no event container.
      Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})
# Fires after metal_machine_issues{issueid="bmc-info-outdated"} has been 1
# for five minutes.
- alert: MetalBmcInfoOutdated
  # Two joins enrich the issue series: `partition` is copied from
  # metal_machine_allocation_info (matched on machineid); `description`,
  # `severity` and `refurl` are copied from metal_machine_issues_info
  # (matched on issueid). The folded scalar yields a single-line expression.
  expr: >-
    (metal_machine_issues{issueid="bmc-info-outdated"} == 1)
    * on (machineid) group_left(partition) metal_machine_allocation_info
    * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
  for: 5m
  labels:
    # NOTE(review): this static label presumably takes precedence over the
    # `severity` joined in by the expression; confirm that is intended.
    severity: warning
  annotations:
    description: >-
      Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}`
      does not receive BMC updates.
      Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})
# Fires after metal_machine_issues{issueid="bmc-no-distinct-ip"} has been 1
# for five minutes.
- alert: MetalBmcNoDistinctIP
  # Two joins enrich the issue series: `partition` is copied from
  # metal_machine_allocation_info (matched on machineid); `description`,
  # `severity` and `refurl` are copied from metal_machine_issues_info
  # (matched on issueid). The folded scalar yields a single-line expression.
  expr: >-
    (metal_machine_issues{issueid="bmc-no-distinct-ip"} == 1)
    * on (machineid) group_left(partition) metal_machine_allocation_info
    * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
  for: 5m
  labels:
    # NOTE(review): this static label presumably takes precedence over the
    # `severity` joined in by the expression; confirm that is intended.
    severity: warning
  annotations:
    description: >-
      Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}`
      has no distinct BMC address.
      Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})
# Fires after metal_machine_issues{issueid="crashloop"} has been 1 for five
# minutes.
- alert: MetalMachineCrashloop
  # Two joins enrich the issue series: `partition` is copied from
  # metal_machine_allocation_info (matched on machineid); `description`,
  # `severity` and `refurl` are copied from metal_machine_issues_info
  # (matched on issueid). The folded scalar yields a single-line expression.
  expr: >-
    (metal_machine_issues{issueid="crashloop"} == 1)
    * on (machineid) group_left(partition) metal_machine_allocation_info
    * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
  for: 5m
  labels:
    # NOTE(review): this static label presumably takes precedence over the
    # `severity` joined in by the expression; confirm that is intended.
    severity: warning
  annotations:
    description: >-
      Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}`
      is in a provisioning crashloop.
      Use `metalctl machine issues` for further inspection ({{ $labels.refurl }})
# Fires after metal_machine_issues{issueid="liveliness-dead"} has been 1 for
# five minutes.
- alert: MetalMachineDead
  # Two joins enrich the issue series: `partition` is copied from
  # metal_machine_allocation_info (matched on machineid); `description`,
  # `severity` and `refurl` are copied from metal_machine_issues_info
  # (matched on issueid). The folded scalar yields a single-line expression.
  expr: >-
    (metal_machine_issues{issueid="liveliness-dead"} == 1)
    * on (machineid) group_left(partition) metal_machine_allocation_info
    * on (issueid) group_left(description,severity,refurl) metal_machine_issues_info
  for: 5m
  labels:
    # NOTE(review): this static label presumably takes precedence over the
    # `severity` joined in by the expression; confirm that is intended.
    severity: warning
  annotations:
    description: >-
      Machine `{{ $labels.machineid }}` in partition `{{ $labels.partition }}`
      is dead.
      Use `metalctl machine issues` for further inspection ({{ $labels.refurl }}).
- name: metal-api-recording.rules
rules:
- record: frr:instance:metal_switch_interface_info
Expand All @@ -327,7 +389,7 @@ additionalPrometheusRulesMap:
labels:
severity: "warning"
annotations:
description: "Partition {{ $labels.partition }} has {{ $value }} DEAD Machines"
description: "Partition {{ $labels.partition }} has {{ $value }} dead machines."
- alert: MachineCapacityLow
expr: (avg(metal_partition_capacity_free{size!="unknown"} > 5) by (partition, size) / avg(metal_partition_capacity_total{size!="unknown"} > 5) by (partition, size)) * 100 < 10
for: 10m
Expand Down

0 comments on commit eb52c51

Please sign in to comment.