# conf-example.yml
---
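# Cluster-wide variables. A value defined here can be interpolated into the
# monitor templates below, e.g. {{ ClusterVariables.warning_notifications }}
# in the binding ruleset's message further down.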
cluster_variables:
  warning_notifications: "@slack"
rulesets:
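# Each ruleset pairs Kubernetes objects (matched by type and annotations)
# with the Datadog monitors to manage for them. This first ruleset should
# apply to Deployments annotated astro/owner=astro, alerting when a
# Deployment has had 0 available replicas for 10 minutes.
# Note the quoting trick used throughout the messages: {{ "{{#is_alert}}" }}
# is a Go template expression that emits the literal {{#is_alert}}, so
# Datadog's own template conditionals survive the rendering pass.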
- type: deployment
  match_annotations:
    - name: astro/owner
      value: astro
  monitors:
    deploy-replica-alert:
      name: "Deployment Replica Alert - {{ .ObjectMeta.Name }}"
      type: metric alert
      query: "max(last_10m):max:kubernetes_state.deployment.replicas_available{namespace:{{ .ObjectMeta.Namespace }}} by {deployment} <= 0"
      message: |-
        {{ "{{#is_alert}}" }}
        Available replicas is currently 0 for {{ .ObjectMeta.Name }}
        {{ "{{/is_alert}}" }}
        {{ "{{^is_alert}}" }}
        Available replicas is no longer 0 for {{ .ObjectMeta.Name }}
        {{ "{{/is_alert}}" }}
      tags: []
      options:
        no_data_timeframe: 60
        notify_audit: false
        notify_no_data: false
        renotify_interval: 5
        new_host_delay: 5
        evaluation_delay: 300
        timeout_h: 1
        escalation_message: ""
        thresholds:
          critical: 0
        require_full_window: true
        locked: false
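# A binding ruleset: matching is presumably done on the namespace carrying
# the astro/admin=fairwinds annotation, with the monitors then created for
# each of the bound_objects (here, every Deployment) inside that namespace.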
- type: binding
  bound_objects:
    - deployment
  match_annotations:
    - name: astro/admin
      value: fairwinds
  monitors:
    bound-deploy-replica-alert:
      name: "Deployment Replica Alert - {{ .ObjectMeta.Name }}"
      type: metric alert
      query: "max(last_10m):max:kubernetes_state.deployment.replicas_available{deployment:{{ .ObjectMeta.Name }}} <= 0"
      message: |-
        {{ "{{#is_alert}}" }}
        Available replicas is currently 0 for {{ "{{deployment.name}}" }}
        {{ "{{/is_alert}}" }}
        {{ "{{^is_alert}}" }}
        Available replicas is no longer 0 for {{ "{{deployment.name}}" }}
        {{ "{{/is_alert}}" }}
        {{ ClusterVariables.warning_notifications }}
      tags: []
      options:
        notify_audit: false
        notify_no_data: false
        new_host_delay: 300
        thresholds:
          critical: 0
        locked: false
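# Namespace ruleset: for namespaces annotated astro/admin-bound=true, alert
# when at least one pod has sat in the Pending phase for 30 minutes. The
# self-cancelling running terms in the query appear to be there so the
# series still evaluates (with .fill(zero)) when nothing is Pending,
# letting the monitor recover cleanly.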
- type: namespace
  match_annotations:
    - name: astro/admin-bound
      value: "true"
  monitors:
    ns-pending-pods:
      name: "Pending Pods - {{ .ObjectMeta.Name }}"
      type: query alert
      query: "min(last_30m):sum:kubernetes_state.pod.status_phase{phase:running,namespace:{{ .ObjectMeta.Name }}} - sum:kubernetes_state.pod.status_phase{phase:running,namespace:{{ .ObjectMeta.Name }}} + sum:kubernetes_state.pod.status_phase{phase:pending,namespace:{{ .ObjectMeta.Name }}}.fill(zero) >= 1"
      message: |-
        {{ "{{#is_alert}}" }}
        There has been at least 1 pod Pending for 30 minutes.
        There are currently {{ "{{value}}" }} pods Pending.
        - Is something crash-looping?
        - Is autoscaling adding node capacity where needed?
        - Is a secret or a configmap missing?
        {{ "{{/is_alert}}" }}
        {{ "{{^is_alert}}" }}
        Pods are no longer pending.
        {{ "{{/is_alert}}" }}
      tags: []
      options:
        notify_audit: false
        notify_no_data: false
        new_host_delay: 300
        thresholds:
          critical: 1.0
        locked: false
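# Namespace ruleset: for namespaces annotated astro/admin=fairwinds, alert
# per pod when container restarts have risen by more than 3 compared with
# an hour earlier (the hour_before() offset in the query).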
- type: namespace
  match_annotations:
    - name: astro/admin
      value: fairwinds
  monitors:
    ns-increased-pod-crash:
      name: "Increased Pod Crashes - {{ .ObjectMeta.Name }}"
      type: query alert
      query: "avg(last_5m):avg:kubernetes_state.container.restarts{namespace:{{ .ObjectMeta.Name }}} by {pod} - hour_before(avg:kubernetes_state.container.restarts{namespace:{{ .ObjectMeta.Name }}} by {pod}) > 3"
      message: |-
        {{ "{{#is_alert}}" }}
        {{ "{{pod.name}}" }} has crashed repeatedly over the last hour
        {{ "{{/is_alert}}" }}
        {{ "{{^is_alert}}" }}
        {{ "{{pod.name}}" }} appears to have stopped crashing
        {{ "{{/is_alert}}" }}
      tags: []
      options:
        notify_audit: false
        notify_no_data: false
        thresholds:
          critical: 3
        locked: false