# conf-example.yml
---
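# Cluster-wide variables. A value defined here can be interpolated into the
# monitor templates below, e.g. {{ ClusterVariables.warning_notifications }}
# in the binding ruleset's message further down.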
cluster_variables:
  warning_notifications: "@slack"
rulesets:
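# Each ruleset pairs Kubernetes objects (matched by type and annotations)
# with the Datadog monitors to manage for them. This first ruleset should
# apply to Deployments annotated astro/owner=astro, alerting when a
# Deployment has had 0 available replicas for 10 minutes.
# Note the quoting trick used throughout the messages: {{ "{{#is_alert}}" }}
# is a Go template expression that emits the literal {{#is_alert}}, so
# Datadog's own template conditionals survive the rendering pass.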
- type: deployment
  match_annotations:
    - name: astro/owner
      value: astro
  monitors:
    deploy-replica-alert:
      name: "Deployment Replica Alert - {{ .ObjectMeta.Name }}"
      type: metric alert
      query: "max(last_10m):max:kubernetes_state.deployment.replicas_available{namespace:{{ .ObjectMeta.Namespace }}} by {deployment} <= 0"
      message: |-
        {{ "{{#is_alert}}" }}
        Available replicas is currently 0 for {{ .ObjectMeta.Name }}
        {{ "{{/is_alert}}" }}
        {{ "{{^is_alert}}" }}
        Available replicas is no longer 0 for {{ .ObjectMeta.Name }}
        {{ "{{/is_alert}}" }}
      tags: []
      options:
        no_data_timeframe: 60
        notify_audit: false
        notify_no_data: false
        renotify_interval: 5
        new_host_delay: 5
        evaluation_delay: 300
        timeout_h: 1
        escalation_message: ""
        thresholds:
          critical: 0
        require_full_window: true
        locked: false
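# A binding ruleset: matching is presumably done on the namespace carrying
# the astro/admin=fairwinds annotation, with the monitors then created for
# each of the bound_objects (here, every Deployment) inside that namespace.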
- type: binding
  bound_objects:
    - deployment
  match_annotations:
    - name: astro/admin
      value: fairwinds
  monitors:
    bound-deploy-replica-alert:
      name: "Deployment Replica Alert - {{ .ObjectMeta.Name }}"
      type: metric alert
      query: "max(last_10m):max:kubernetes_state.deployment.replicas_available{deployment:{{ .ObjectMeta.Name }}} <= 0"
      message: |-
        {{ "{{#is_alert}}" }}
        Available replicas is currently 0 for {{ "{{deployment.name}}" }}
        {{ "{{/is_alert}}" }}
        {{ "{{^is_alert}}" }}
        Available replicas is no longer 0 for {{ "{{deployment.name}}" }}
        {{ "{{/is_alert}}" }}
        {{ ClusterVariables.warning_notifications }}
      tags: []
      options:
        notify_audit: false
        notify_no_data: false
        new_host_delay: 300
        thresholds:
          critical: 0
        locked: false
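# Namespace ruleset: for namespaces annotated astro/admin-bound=true, alert
# when at least one pod has sat in the Pending phase for 30 minutes. The
# self-cancelling running terms in the query appear to be there so the
# series still evaluates (with .fill(zero)) when nothing is Pending,
# letting the monitor recover cleanly.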
- type: namespace
  match_annotations:
    - name: astro/admin-bound
      value: "true"
  monitors:
    ns-pending-pods:
      name: "Pending Pods - {{ .ObjectMeta.Name }}"
      type: query alert
      query: "min(last_30m):sum:kubernetes_state.pod.status_phase{phase:running,namespace:{{ .ObjectMeta.Name }}} - sum:kubernetes_state.pod.status_phase{phase:running,namespace:{{ .ObjectMeta.Name }}} + sum:kubernetes_state.pod.status_phase{phase:pending,namespace:{{ .ObjectMeta.Name }}}.fill(zero) >= 1"
      message: |-
        {{ "{{#is_alert}}" }}
        There has been at least 1 pod Pending for 30 minutes.
        There are currently {{ "{{value}}" }} pods Pending.
        - Is something crash-looping?
        - Is autoscaling adding node capacity where needed?
        - Is a secret or a configmap missing?
        {{ "{{/is_alert}}" }}
        {{ "{{^is_alert}}" }}
        Pods are no longer pending.
        {{ "{{/is_alert}}" }}
      tags: []
      options:
        notify_audit: false
        notify_no_data: false
        new_host_delay: 300
        thresholds:
          critical: 1.0
        locked: false
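# Namespace ruleset: for namespaces annotated astro/admin=fairwinds, alert
# per pod when container restarts have risen by more than 3 compared with
# an hour earlier (the hour_before() offset in the query).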
- type: namespace
  match_annotations:
    - name: astro/admin
      value: fairwinds
  monitors:
    ns-increased-pod-crash:
      name: "Increased Pod Crashes - {{ .ObjectMeta.Name }}"
      type: query alert
      query: "avg(last_5m):avg:kubernetes_state.container.restarts{namespace:{{ .ObjectMeta.Name }}} by {pod} - hour_before(avg:kubernetes_state.container.restarts{namespace:{{ .ObjectMeta.Name }}} by {pod}) > 3"
      message: |-
        {{ "{{#is_alert}}" }}
        {{ "{{pod.name}}" }} has crashed repeatedly over the last hour
        {{ "{{/is_alert}}" }}
        {{ "{{^is_alert}}" }}
        {{ "{{pod.name}}" }} appears to have stopped crashing
        {{ "{{/is_alert}}" }}
      tags: []
      options:
        notify_audit: false
        notify_no_data: false
        thresholds:
          critical: 3
        locked: false