bird-house · tlvu · Jul 11, 2020 · Jul 3, 2020 · Jul 3, 2020 · Jul 3, 2020
@@ -1,3 +1,5 @@
 prometheus.yml
 grafana_datasources.yml
 grafana_dashboards.yml
+alertmanager.yml
+prometheus.rules
@@ -0,0 +1,74 @@
+# https://prometheus.io/docs/alerting/latest/configuration/
+# http://${PAVICS_FQDN}:9093/#/status
+global:
+  # The smarthost and SMTP sender used for mail notifications.
+  smtp_smarthost: '${SMTP_SERVER}'
+  smtp_from: 'alertmanager@${PAVICS_FQDN}'
+  smtp_hello: '${PAVICS_FQDN}'
+${ALERTMANAGER_EXTRA_GLOBAL}
+# Below are example candidates for ALERTMANAGER_EXTRA_GLOBAL:
+#  smtp_auth_username: 'alertmanager'
+#  smtp_auth_password: 'password'
+#  smtp_require_tls: false
+
+# The directory from which notification templates are read.
+templates:
+- '/etc/alertmanager/template/*.tmpl'
+
+# The root route on which each incoming alert enters.
+route:
+  # The labels by which incoming alerts are grouped together. For example,
+  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+  # be batched into a single group.
+  #
+  # To aggregate by all possible labels use '...' as the sole label name.
+  # This effectively disables aggregation entirely, passing through all
+  # alerts as-is. This is unlikely to be what you want, unless you have
+  # a very low alert volume or your upstream notification system performs
+  # its own grouping. Example: group_by: [...]
+  group_by: ['alertname', 'cluster', 'service']
+
+  # When a new group of alerts is created by an incoming alert, wait at
+  # least 'group_wait' to send the initial notification.
+  # This ensures that multiple alerts for the same group that start firing
+  # shortly after one another are batched together in the first
+  # notification.
+  group_wait: 30s
+
+  # Once the first notification has been sent, wait 'group_interval' before
+  # sending a batch of new alerts that started firing for that group.
+  group_interval: 5m
+
+  # If a notification has already been sent successfully, wait
+  # 'repeat_interval' before resending it.
+  repeat_interval: 6h
+
+  # A default receiver
+  receiver: admin-emails
+
+${ALERTMANAGER_EXTRA_ROUTES}
+
+# Inhibition rules make it possible to mute a set of alerts while another
+# alert is firing.
+# We use this to mute any warning-level notifications if the same alert is
+# already critical.
+inhibit_rules:
+- source_match:
+    severity: 'critical'
+  target_match:
+    severity: 'warning'
+  # Apply inhibition only if the labels listed in `equal` match on both alerts.
+  # CAUTION:
+  #   If all label names listed in `equal` are missing
+  #   from both the source and target alerts,
+  #   the inhibition rule will apply!
+  equal: ['alertname', 'cluster', 'service']
+
+${ALERTMANAGER_EXTRA_INHIBITION}
+
+receivers:
+- name: 'admin-emails'
+  email_configs:
+  - to: '${ALERTMANAGER_ADMIN_EMAIL_RECEIVER}'
+
+${ALERTMANAGER_EXTRA_RECEIVERS}
@@ -38,6 +38,7 @@ services:
     container_name: prometheus
     volumes:
       - ./components/monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./components/monitoring/prometheus.rules:/etc/prometheus/prometheus.rules:ro
       - prometheus_persistence:/prometheus:rw
     ports:
       - 9090:9090
@@ -49,6 +50,8 @@ services:
       - --web.console.templates=/usr/share/prometheus/consoles
       # https://prometheus.io/docs/prometheus/latest/storage/
       - --storage.tsdb.retention.time=90d
+      # wrong default was http://container-hash:9090/
+      - --web.external-url=http://${PAVICS_FQDN}:9090/
     restart: unless-stopped
 
   # https://grafana.com/docs/grafana/latest/installation/docker/
@@ -68,12 +71,37 @@ services:
       - 3001:3000
     restart: unless-stopped
 
+  # https://github.com/prometheus/alertmanager
+  # https://prometheus.io/docs/alerting/latest/overview/
+  # Handles alerts: deduplicate, group, route, silence, inhibit.
+  alertmanager:
+    image: prom/alertmanager:v0.21.0
+    container_name: alertmanager
+    volumes:
+      - ./components/monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - ./components/monitoring/alertmanager.tmpl:/etc/alertmanager/template/default.tmpl:ro
+      - alertmanager_persistence:/alertmanager:rw
+    command:
+      # `command` replaces the image's CMD, so restore its original flags first
+      - --config.file=/etc/alertmanager/alertmanager.yml
+      - --storage.path=/alertmanager
+      # enable debug logging
+      - --log.level=debug
+      # the default, http://container-hash:9093/, is not the external URL
+      - --web.external-url=http://${PAVICS_FQDN}:9093/
+    ports:
+      # quoted so the HOST:CONTAINER mapping is always read as a string
+      - "9093:9093"
+    restart: unless-stopped
+
 volumes:
   prometheus_persistence:
     external:
       name: prometheus_persistence
   grafana_persistence:
     external:
       name: grafana_persistence
+  alertmanager_persistence:
+    external:
+      name: alertmanager_persistence
 
 # vi: tabstop=8 expandtab shiftwidth=2 softtabstop=2