Skip to content

Commit

Permalink
deploy: add options for prometheus monitoring
Browse files Browse the repository at this point in the history
Adds a monitoring configuration for users of Prometheus Operator.
This includes Monitors for LINSTOR Controller and Satellites, as
well as alerting rules and a Grafana dashboard.

Signed-off-by: Moritz Wanzenböck <moritz.wanzenboeck@linbit.com>
  • Loading branch information
WanzenBug committed Oct 25, 2023
1 parent 0b7273a commit cc09ab9
Show file tree
Hide file tree
Showing 7 changed files with 2,304 additions and 26 deletions.
94 changes: 94 additions & 0 deletions config/extras/monitoring/alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
---
# PrometheusRule (Prometheus Operator CRD) defining alerts for the Piraeus
# datastore. Two groups are declared: "linstor.rules" built on linstor_*
# metrics, and "drbd.rules" built on drbd_* metrics.
#
# All non-critical alerts use `severity: warning` so that a single matcher
# covers every rule in this file.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: piraeus-datastore
spec:
  groups:
    - name: linstor.rules
      rules:
        # Fires when the Satellite error report counter grew within the last 15m.
        - alert: linstorSatelliteErrorRate
          annotations:
            description: |
              LINSTOR Satellite "{{ $labels.hostname }}" reports {{ $value }} errors in the last 15 minutes.
              Use "linstor error-reports list --nodes {{ $labels.hostname }} --since 15minutes" to see them.
          expr: increase(linstor_error_reports_count{module="SATELLITE"}[15m]) > 0
          labels:
            severity: warning
        # Same check as above, restricted to the Controller module.
        - alert: linstorControllerErrorRate
          annotations:
            description: |
              LINSTOR Controller reports {{ $value }} errors in the last 15 minutes.
              Use "linstor error-reports list --since 15minutes" to see them.
          expr: increase(linstor_error_reports_count{module="CONTROLLER"}[15m]) > 0
          labels:
            severity: warning
        # Any Satellite node state other than 2 raises the alert; going by the
        # description text, 2 is the value treated as ONLINE.
        - alert: linstorSatelliteNotOnline
          annotations:
            description: |
              LINSTOR Satellite "{{ $labels.hostname }}" is not ONLINE.
              Check that the Satellite is running and reachable from the LINSTOR Controller.
          expr: linstor_node_state{nodetype="SATELLITE"} != 2
          labels:
            severity: critical
        - alert: linstorStoragePoolErrors
          annotations:
            description: |
              Storage pool "{{ $labels.storage_pool }}" on node "{{ $labels.node }}" ({{ $labels.driver }}={{ $labels.backing_pool }}) is reporting errors.
          expr: linstor_storage_pool_error_count > 0
          labels:
            severity: critical
        # Free/total ratio below 0.05, i.e. less than 5% free space.
        - alert: linstorStoragePoolAtCapacity
          annotations:
            description: |
              Storage pool "{{ $labels.storage_pool }}" on node "{{ $labels.node }}" ({{ $labels.driver }}={{ $labels.backing_pool }}) has less than 5% free space available.
          expr: ( linstor_storage_pool_capacity_free_bytes / linstor_storage_pool_capacity_total_bytes ) < 0.05
          labels:
            severity: warning
    - name: drbd.rules
      rules:
        # Selects series for every connection state except "Connected" and
        # alerts when such a series has a value above 0.
        - alert: drbdConnectionNotConnected
          annotations:
            description: |
              DRBD Resource "{{ $labels.name }}" on "{{ $labels.pod }}" is not connected to "{{ $labels.conn_name }}": {{ $labels.drbd_connection_state }}.
          expr: drbd_connection_state{drbd_connection_state!="Connected"} > 0
          labels:
            severity: warning
        # "UpToDate" and "Diskless" are the only device states not alerted on.
        - alert: drbdDeviceNotUpToDate
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.pod }}" has unexpected device state "{{ $labels.drbd_device_state }}".
          expr: drbd_device_state{drbd_device_state!~"UpToDate|Diskless"} > 0
          labels:
            severity: warning
        - alert: drbdDeviceUnintentionalDiskless
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.pod }}" is unintentionally diskless.
              This usually indicates IO errors reported on the backing device. Check the kernel log.
          expr: drbd_device_unintentionaldiskless > 0
          labels:
            severity: warning
        - alert: drbdDeviceWithoutQuorum
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.pod }}" has no quorum.
              This usually indicates connectivity issues.
          expr: drbd_device_quorum == 0
          labels:
            severity: warning
        # `for: 1m` keeps the alert pending until the condition held for 1 minute.
        - alert: drbdResourceSuspended
          annotations:
            description: |
              DRBD resource "{{ $labels.name }}" on "{{ $labels.pod }}" has been suspended for 1m.
          for: 1m
          expr: drbd_resource_suspended > 0
          labels:
            severity: warning
        # Parsed as: <Inconsistent devices> and (delta(out-of-sync bytes) >= 0),
        # i.e. the out-of-sync byte count did not decrease over 5 minutes.
        # NOTE(review): `and` without on()/ignoring() only matches series with
        # identical label sets. The left side carries a `drbd_device_state`
        # label; confirm that drbd_peerdevice_outofsync_bytes exposes the same
        # labels, otherwise this expression can never return a result.
        - alert: drbdResourceResyncWithoutProgress
          annotations:
            description: |
              DRBD resource "{{ $labels.name }}" on "{{ $labels.pod }}" has been resyncing without progress for 5 minutes.
          expr: drbd_device_state{drbd_device_state="Inconsistent"} and delta(drbd_peerdevice_outofsync_bytes[5m]) >= 0
          labels:
            severity: warning
17 changes: 17 additions & 0 deletions config/extras/monitoring/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
---
# Kustomization bundling the Prometheus Operator monitoring extras:
# the two monitor manifests, the alerting rules, and a generated ConfigMap
# built from the dashboard JSON file.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - linstor-controller-monitor.yaml
  - linstor-satellite-monitor.yaml
  - alerts.yaml

# Generated objects keep their plain names (no content-hash suffix).
generatorOptions:
  disableNameSuffixHash: true

configMapGenerator:
  # ConfigMap generated from piraeus-dashboard.json in the "prometheus"
  # namespace.
  # NOTE(review): the grafana_dashboard label is presumably what a Grafana
  # dashboard loader selects on; confirm against the target Grafana setup.
  - name: piraeus-datastore-dashboard
    namespace: prometheus
    options:
      labels:
        grafana_dashboard: "1"
    files:
      - piraeus-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: linstor-controller
namespace: system
labels:
app.kubernetes.io/component: linstor-controller
spec:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: linstor-satellite
namespace: system
labels:
app.kubernetes.io/component: linstor-satellite
spec:
Expand Down
Loading

0 comments on commit cc09ab9

Please sign in to comment.