From 660a3d0ae27959878d17483bb7f6024851f7f2c3 Mon Sep 17 00:00:00 2001
From: Robert Gildein
Date: Thu, 15 Aug 2024 12:20:53 +0200
Subject: [PATCH 1/2] Add alert rules to dex-auth based on the KF093 spec

---
Reviewer notes (everything between the three-dash line and the diff is
outside the commit message):

  * Creates the rule group KubeflowDexAuthServices with two alerts that are
    both driven by the `up` series:
      - KubeflowServiceDown (severity critical): `up{} < 1` held for 5m.
      - KubeflowServiceIsNotStable (severity warning): the 10-minute average
        of `up` is below 0.5; `for: 0m` means no additional hold time.
  * The KubeflowServiceIsNotStable description reads "more than 50%" because
    the comparison `< 0.5` is strict: an average of exactly 0.5 (down for
    exactly half of the samples) does not fire the alert.
  * NOTE(review): line breaks and YAML indentation in this series were
    restored from a whitespace-collapsed copy - confirm against the
    repository before applying. The abbreviated blob id on the `index` line
    was not recomputed after the description wording was changed.

 .../KubeflowDexAuthServices.rules | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 src/prometheus_alert_rules/KubeflowDexAuthServices.rules

diff --git a/src/prometheus_alert_rules/KubeflowDexAuthServices.rules b/src/prometheus_alert_rules/KubeflowDexAuthServices.rules
new file mode 100644
index 00000000..8f5e46ef
--- /dev/null
+++ b/src/prometheus_alert_rules/KubeflowDexAuthServices.rules
@@ -0,0 +1,24 @@
+groups:
+- name: KubeflowDexAuthServices
+  rules:
+  - alert: KubeflowServiceDown
+    expr: up{} < 1
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
+      description: |
+        One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}.
+        LABELS = {{ $labels }}
+
+  - alert: KubeflowServiceIsNotStable
+    expr: avg_over_time(up{}[10m]) < 0.5
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
+      description: |
+        {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable more than 50% of the time over the last 10 minutes.
+        LABELS = {{ $labels }}

From 4097ddbc2f610b48d4ccde56b77f90019648eaaf Mon Sep 17 00:00:00 2001
From: Robert Gildein
Date: Thu, 15 Aug 2024 12:29:32 +0200
Subject: [PATCH 2/2] Delete src/prometheus_alert_rules/unit_unavailable.rule

---
Reviewer notes (everything between the three-dash line and the diff is
outside the commit message):

  * Removes the single-alert rule file unit_unavailable.rule. Its only alert,
    DexAuthUnitIsUnavailable, used `up < 1` for 5m at severity critical - the
    same condition, hold time and severity as KubeflowServiceDown added in
    patch 1/2.

 src/prometheus_alert_rules/unit_unavailable.rule | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 src/prometheus_alert_rules/unit_unavailable.rule

diff --git a/src/prometheus_alert_rules/unit_unavailable.rule b/src/prometheus_alert_rules/unit_unavailable.rule
deleted file mode 100644
index f76a2284..00000000
--- a/src/prometheus_alert_rules/unit_unavailable.rule
+++ /dev/null
@@ -1,10 +0,0 @@
-alert: DexAuthUnitIsUnavailable
-expr: up < 1
-for: 5m
-labels:
-  severity: critical
-annotations:
-  summary: Dex-auth unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} unavailable
-  description: >
-    The dex-auth unit {{ $labels.juju_model }} {{ $labels.juju_unit }} is unavailable
-    LABELS = {{ $labels }}