diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index 8278034d05..a7fe71f53a 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -440,10 +440,14 @@ func run() int { integrationsNum += len(integrations) } - // Build the map of time interval names to mute time definitions. - muteTimes := make(map[string][]timeinterval.TimeInterval, len(conf.MuteTimeIntervals)) + // Build the map of time interval names to time interval definitions. + timeIntervals := make(map[string][]timeinterval.TimeInterval, len(conf.MuteTimeIntervals)+len(conf.TimeIntervals)) for _, ti := range conf.MuteTimeIntervals { - muteTimes[ti.Name] = ti.TimeIntervals + timeIntervals[ti.Name] = ti.TimeIntervals + } + + for _, ti := range conf.TimeIntervals { + timeIntervals[ti.Name] = ti.TimeIntervals } inhibitor.Stop() @@ -465,7 +469,7 @@ func run() int { waitFunc, inhibitor, silencer, - muteTimes, + timeIntervals, notificationLog, pipelinePeer, ) diff --git a/config/config.go b/config/config.go index 27eac43ccf..448f18532c 100644 --- a/config/config.go +++ b/config/config.go @@ -269,14 +269,34 @@ func (mt *MuteTimeInterval) UnmarshalYAML(unmarshal func(interface{}) error) err return nil } +// TimeInterval represents a named set of time intervals that a route can reference to be muted or active. +type TimeInterval struct { + Name string `yaml:"name" json:"name"` + TimeIntervals []timeinterval.TimeInterval `yaml:"time_intervals" json:"time_intervals"` +} + +// UnmarshalYAML implements the yaml.Unmarshaler interface for TimeInterval. +func (ti *TimeInterval) UnmarshalYAML(unmarshal func(interface{}) error) error { + type plain TimeInterval + if err := unmarshal((*plain)(ti)); err != nil { + return err + } + if ti.Name == "" { + return fmt.Errorf("missing name in time interval") + } + return nil +} + // Config is the top-level configuration for Alertmanager's config files. 
type Config struct { - Global *GlobalConfig `yaml:"global,omitempty" json:"global,omitempty"` - Route *Route `yaml:"route,omitempty" json:"route,omitempty"` - InhibitRules []*InhibitRule `yaml:"inhibit_rules,omitempty" json:"inhibit_rules,omitempty"` - Receivers []*Receiver `yaml:"receivers,omitempty" json:"receivers,omitempty"` - Templates []string `yaml:"templates" json:"templates"` + Global *GlobalConfig `yaml:"global,omitempty" json:"global,omitempty"` + Route *Route `yaml:"route,omitempty" json:"route,omitempty"` + InhibitRules []*InhibitRule `yaml:"inhibit_rules,omitempty" json:"inhibit_rules,omitempty"` + Receivers []*Receiver `yaml:"receivers,omitempty" json:"receivers,omitempty"` + Templates []string `yaml:"templates" json:"templates"` + // Deprecated. Remove before v1.0 release. MuteTimeIntervals []MuteTimeInterval `yaml:"mute_time_intervals,omitempty" json:"mute_time_intervals,omitempty"` + TimeIntervals []TimeInterval `yaml:"time_intervals,omitempty" json:"time_intervals,omitempty"` // original is the input from which the config was parsed. original string @@ -491,18 +511,32 @@ func (c *Config) UnmarshalYAML(unmarshal func(interface{}) error) error { return fmt.Errorf("root route must not have any mute time intervals") } + if len(c.Route.ActiveTimeIntervals) > 0 { + return fmt.Errorf("root route must not have any active time intervals") + } + // Validate that all receivers used in the routing tree are defined. 
if err := checkReceiver(c.Route, names); err != nil { return err } tiNames := make(map[string]struct{}) + + // read mute time intervals until deprecated for _, mt := range c.MuteTimeIntervals { if _, ok := tiNames[mt.Name]; ok { return fmt.Errorf("mute time interval %q is not unique", mt.Name) } tiNames[mt.Name] = struct{}{} } + + for _, mt := range c.TimeIntervals { + if _, ok := tiNames[mt.Name]; ok { + return fmt.Errorf("time interval %q is not unique", mt.Name) + } + tiNames[mt.Name] = struct{}{} + } + return checkTimeInterval(c.Route, tiNames) } @@ -529,12 +563,16 @@ func checkTimeInterval(r *Route, timeIntervals map[string]struct{}) error { return err } } - if len(r.MuteTimeIntervals) == 0 { - return nil + + for _, ti := range r.ActiveTimeIntervals { + if _, ok := timeIntervals[ti]; !ok { + return fmt.Errorf("undefined time interval %q used in route", ti) + } } - for _, mt := range r.MuteTimeIntervals { - if _, ok := timeIntervals[mt]; !ok { - return fmt.Errorf("undefined time interval %q used in route", mt) + + for _, tm := range r.MuteTimeIntervals { + if _, ok := timeIntervals[tm]; !ok { + return fmt.Errorf("undefined time interval %q used in route", tm) } } return nil @@ -694,11 +732,12 @@ type Route struct { // Deprecated. Remove before v1.0 release. Match map[string]string `yaml:"match,omitempty" json:"match,omitempty"` // Deprecated. Remove before v1.0 release. 
- MatchRE MatchRegexps `yaml:"match_re,omitempty" json:"match_re,omitempty"` - Matchers Matchers `yaml:"matchers,omitempty" json:"matchers,omitempty"` - MuteTimeIntervals []string `yaml:"mute_time_intervals,omitempty" json:"mute_time_intervals,omitempty"` - Continue bool `yaml:"continue" json:"continue,omitempty"` - Routes []*Route `yaml:"routes,omitempty" json:"routes,omitempty"` + MatchRE MatchRegexps `yaml:"match_re,omitempty" json:"match_re,omitempty"` + Matchers Matchers `yaml:"matchers,omitempty" json:"matchers,omitempty"` + MuteTimeIntervals []string `yaml:"mute_time_intervals,omitempty" json:"mute_time_intervals,omitempty"` + ActiveTimeIntervals []string `yaml:"active_time_intervals,omitempty" json:"active_time_intervals,omitempty"` + Continue bool `yaml:"continue" json:"continue,omitempty"` + Routes []*Route `yaml:"routes,omitempty" json:"routes,omitempty"` GroupWait *model.Duration `yaml:"group_wait,omitempty" json:"group_wait,omitempty"` GroupInterval *model.Duration `yaml:"group_interval,omitempty" json:"group_interval,omitempty"` diff --git a/config/config_test.go b/config/config_test.go index 23f961c4b8..73020a330c 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -177,9 +177,35 @@ receivers: } -func TestMuteTimeHasName(t *testing.T) { +func TestActiveTimeExists(t *testing.T) { in := ` -mute_time_intervals: +route: + receiver: team-Y + routes: + - match: + severity: critical + active_time_intervals: + - business_hours + +receivers: +- name: 'team-Y' +` + _, err := Load(in) + + expected := "undefined time interval \"business_hours\" used in route" + + if err == nil { + t.Fatalf("no error returned, expected:\n%q", expected) + } + if err.Error() != expected { + t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error()) + } + +} + +func TestTimeIntervalHasName(t *testing.T) { + in := ` +time_intervals: - name: time_intervals: - times: @@ -199,7 +225,7 @@ route: ` _, err := Load(in) - expected := "missing name in mute time interval" + 
expected := "missing name in time interval" if err == nil { t.Fatalf("no error returned, expected:\n%q", expected) @@ -358,6 +384,35 @@ route: } +func TestRootRouteNoActiveTimes(t *testing.T) { + in := ` +time_intervals: +- name: my_active_time + time_intervals: + - times: + - start_time: '09:00' + end_time: '17:00' + +receivers: +- name: 'team-X-mails' + +route: + receiver: 'team-X-mails' + active_time_intervals: + - my_active_time +` + _, err := Load(in) + + expected := "root route must not have any active time intervals" + + if err == nil { + t.Fatalf("no error returned, expected:\n%q", expected) + } + if err.Error() != expected { + t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error()) + } +} + func TestRootRouteHasNoMatcher(t *testing.T) { testCases := []struct { name string diff --git a/dispatch/dispatch.go b/dispatch/dispatch.go index 91112a5270..65d7feacb3 100644 --- a/dispatch/dispatch.go +++ b/dispatch/dispatch.go @@ -447,6 +447,7 @@ func (ag *aggrGroup) run(nf notifyFunc) { ctx = notify.WithReceiverName(ctx, ag.opts.Receiver) ctx = notify.WithRepeatInterval(ctx, ag.opts.RepeatInterval) ctx = notify.WithMuteTimeIntervals(ctx, ag.opts.MuteTimeIntervals) + ctx = notify.WithActiveTimeIntervals(ctx, ag.opts.ActiveTimeIntervals) // Wait the configured interval before calling flush again. ag.mtx.Lock() diff --git a/dispatch/route.go b/dispatch/route.go index f892f264b2..643aa6dae4 100644 --- a/dispatch/route.go +++ b/dispatch/route.go @@ -118,6 +118,7 @@ func NewRoute(cr *config.Route, parent *Route) *Route { sort.Sort(matchers) opts.MuteTimeIntervals = cr.MuteTimeIntervals + opts.ActiveTimeIntervals = cr.ActiveTimeIntervals route := &Route{ parent: parent, @@ -210,6 +211,9 @@ type RouteOpts struct { // A list of time intervals for which the route is muted. MuteTimeIntervals []string + + // A list of time intervals for which the route is active. 
+ ActiveTimeIntervals []string } func (ro *RouteOpts) String() string { diff --git a/docs/configuration.md b/docs/configuration.md index 50a269bc50..b2f0cf5b5f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -116,9 +116,14 @@ receivers: inhibit_rules: [ - ... ] +# DEPRECATED: use time_intervals below. # A list of mute time intervals for muting routes. mute_time_intervals: [ - ... ] + +# A list of time intervals for muting/activating routes. +time_intervals: + [ - ... ] ``` ## `<route>` @@ -189,6 +194,16 @@ matchers: mute_time_intervals: [ - ...] +# Times when the route should be active. These must match the name of a +# time interval defined in the time_intervals section. An empty value +# means that the route is always active. +# Additionally, the root node cannot have any active times. +# The route will send notifications only when active, but otherwise +# acts normally (including ending the route-matching process +# if the `continue` option is not set). +active_time_intervals: + [ - ...] + # Zero or more child routes. routes: [ - ... ] @@ -221,12 +236,32 @@ route: group_by: [product, environment] matchers: - team="frontend" + + # All alerts with the service=inhouse-service label match this sub-route. + # The route will be muted during offhours and holidays time intervals. + # Even if it matches, it will continue to the next sub-route. + - receiver: 'dev-pager' + matchers: + - service="inhouse-service" + mute_time_intervals: + - offhours + - holidays + continue: true + + # All alerts with the service=inhouse-service label match this sub-route. + # The route will be active only during offhours and holidays time intervals. + - receiver: 'on-call-pager' + matchers: + - service="inhouse-service" + active_time_intervals: + - offhours + - holidays ``` -## `<mute_time_interval>` +## `<time_interval>` -A `mute_time_interval` specifies a named interval of time that may be referenced -in the routing tree to mute particular routes for particular times of the day. 
+A `time_interval` specifies a named interval of time that may be referenced +in the routing tree to mute/activate particular routes for particular times of the day. ```yaml name: diff --git a/notify/notify.go b/notify/notify.go index 5e9e63305e..bbdc6e7b01 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -115,6 +115,7 @@ const ( keyResolvedAlerts keyNow keyMuteTimeIntervals + keyActiveTimeIntervals ) // WithReceiverName populates a context with a receiver name. @@ -157,6 +158,10 @@ func WithMuteTimeIntervals(ctx context.Context, mt []string) context.Context { return context.WithValue(ctx, keyMuteTimeIntervals, mt) } +func WithActiveTimeIntervals(ctx context.Context, at []string) context.Context { + return context.WithValue(ctx, keyActiveTimeIntervals, at) +} + // RepeatInterval extracts a repeat interval from the context. Iff none exists, the // second argument is false. func RepeatInterval(ctx context.Context) (time.Duration, bool) { @@ -206,13 +211,20 @@ func ResolvedAlerts(ctx context.Context) ([]uint64, bool) { return v, ok } -// MuteTimeIntervalNames extracts a slice of mute time names from the context. Iff none exists, the +// MuteTimeIntervalNames extracts a slice of mute time names from the context. If and only if none exists, the // second argument is false. func MuteTimeIntervalNames(ctx context.Context) ([]string, bool) { v, ok := ctx.Value(keyMuteTimeIntervals).([]string) return v, ok } +// ActiveTimeIntervalNames extracts a slice of active time names from the context. If none exists, the +// second argument is false. +func ActiveTimeIntervalNames(ctx context.Context) ([]string, bool) { + v, ok := ctx.Value(keyActiveTimeIntervals).([]string) + return v, ok +} + // A Stage processes alerts under the constraints of the given context. 
type Stage interface { Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) @@ -310,7 +322,7 @@ func (pb *PipelineBuilder) New( wait func() time.Duration, inhibitor *inhibit.Inhibitor, silencer *silence.Silencer, - muteTimes map[string][]timeinterval.TimeInterval, + times map[string][]timeinterval.TimeInterval, notificationLog NotificationLog, peer Peer, ) RoutingStage { @@ -319,11 +331,12 @@ func (pb *PipelineBuilder) New( ms := NewGossipSettleStage(peer) is := NewMuteStage(inhibitor) ss := NewMuteStage(silencer) - tms := NewTimeMuteStage(muteTimes) + tms := NewTimeMuteStage(times) + tas := NewTimeActiveStage(times) for name := range receivers { st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics) - rs[name] = MultiStage{ms, is, tms, ss, st} + rs[name] = MultiStage{ms, is, tas, tms, ss, st} } return rs } @@ -775,12 +788,14 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*typ return ctx, alerts, n.nflog.Log(n.recv, gkey, firing, resolved) } -type TimeMuteStage struct { - muteTimes map[string][]timeinterval.TimeInterval +type timeStage struct { + Times map[string][]timeinterval.TimeInterval } -func NewTimeMuteStage(mt map[string][]timeinterval.TimeInterval) *TimeMuteStage { - return &TimeMuteStage{mt} +type TimeMuteStage timeStage + +func NewTimeMuteStage(ti map[string][]timeinterval.TimeInterval) *TimeMuteStage { + return &TimeMuteStage{ti} } // Exec implements the stage interface for TimeMuteStage. 
@@ -795,20 +810,11 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type return ctx, alerts, errors.New("missing now timestamp") } - muted := false -Loop: - for _, mtName := range muteTimeIntervalNames { - mt, ok := tms.muteTimes[mtName] - if !ok { - return ctx, alerts, errors.Errorf("mute time %s doesn't exist in config", mtName) - } - for _, ti := range mt { - if ti.ContainsTime(now.UTC()) { - muted = true - break Loop - } - } + muted, err := inTimeIntervals(now, tms.Times, muteTimeIntervalNames) + if err != nil { + return ctx, alerts, err } + // If the current time is inside a mute time, all alerts are removed from the pipeline. if muted { level.Debug(l).Log("msg", "Notifications not sent, route is within mute time") @@ -816,3 +822,57 @@ Loop: } return ctx, alerts, nil } + +type TimeActiveStage timeStage + +func NewTimeActiveStage(ti map[string][]timeinterval.TimeInterval) *TimeActiveStage { + return &TimeActiveStage{ti} +} + +// Exec implements the stage interface for TimeActiveStage. +// TimeActiveStage is responsible for muting alerts whose route is not in an active time. +func (tas TimeActiveStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) { + activeTimeIntervalNames, ok := ActiveTimeIntervalNames(ctx) + if !ok { + return ctx, alerts, nil + } + + // if we don't have active time intervals at all it is always active. 
+ if len(activeTimeIntervalNames) == 0 { + return ctx, alerts, nil + } + + now, ok := Now(ctx) + if !ok { + return ctx, alerts, errors.New("missing now timestamp") + } + + active, err := inTimeIntervals(now, tas.Times, activeTimeIntervalNames) + if err != nil { + return ctx, alerts, err + } + + // If the current time is not inside an active time, all alerts are removed from the pipeline + if !active { + level.Debug(l).Log("msg", "Notifications not sent, route is not within active time") + return ctx, nil, nil + } + + return ctx, alerts, nil +} + +// inTimeIntervals returns true if the current time is contained in one of the given time intervals. +func inTimeIntervals(now time.Time, intervals map[string][]timeinterval.TimeInterval, intervalNames []string) (bool, error) { + for _, name := range intervalNames { + interval, ok := intervals[name] + if !ok { + return false, errors.Errorf("time interval %s doesn't exist in config", name) + } + for _, ti := range interval { + if ti.ContainsTime(now.UTC()) { + return true, nil + } + } + } + return false, nil +} diff --git a/notify/notify_test.go b/notify/notify_test.go index b120fd670d..d93aa53c8c 100644 --- a/notify/notify_test.go +++ b/notify/notify_test.go @@ -724,7 +724,7 @@ func TestMuteStageWithSilences(t *testing.T) { } func TestTimeMuteStage(t *testing.T) { - // Route mutes alerts outside business hours. + // Route mutes alerts outside business hours if it is a mute_time_interval muteIn := ` --- - weekdays: ['monday:friday'] @@ -800,6 +800,7 @@ func TestTimeMuteStage(t *testing.T) { alerts := []*types.Alert{{Alert: a}} ctx := context.Background() ctx = WithNow(ctx, now) + ctx = WithActiveTimeIntervals(ctx, []string{}) ctx = WithMuteTimeIntervals(ctx, []string{"test"}) _, out, err := stage.Exec(ctx, log.NewNopLogger(), alerts...) 
@@ -818,6 +819,90 @@ func TestTimeMuteStage(t *testing.T) { } } +func TestTimeActiveStage(t *testing.T) { + // Route mutes alerts inside business hours if it is an active time interval + muteIn := ` +--- +- weekdays: ['monday:friday'] + times: + - start_time: '00:00' + end_time: '09:00' + - start_time: '17:00' + end_time: '24:00' +- weekdays: ['saturday', 'sunday']` + + cases := []struct { + fireTime string + labels model.LabelSet + shouldMute bool + }{ + { + // Friday during business hours + fireTime: "01 Jan 21 09:00 +0000", + labels: model.LabelSet{"mute": "me"}, + shouldMute: true, + }, + { + // Tuesday before 5pm + fireTime: "01 Dec 20 16:59 +0000", + labels: model.LabelSet{"mute": "me"}, + shouldMute: true, + }, + { + // Saturday + fireTime: "17 Oct 20 10:00 +0000", + labels: model.LabelSet{"foo": "bar"}, + shouldMute: false, + }, + { + // Wednesday before 9am + fireTime: "14 Oct 20 05:00 +0000", + labels: model.LabelSet{"dont": "mute"}, + shouldMute: false, + }, + } + var intervals []timeinterval.TimeInterval + err := yaml.Unmarshal([]byte(muteIn), &intervals) + if err != nil { + t.Fatalf("Couldn't unmarshal time interval %s", err) + } + m := map[string][]timeinterval.TimeInterval{"test": intervals} + stage := NewTimeActiveStage(m) + + outAlerts := []*types.Alert{} + nonMuteCount := 0 + for _, tc := range cases { + now, err := time.Parse(time.RFC822Z, tc.fireTime) + if err != nil { + t.Fatalf("Couldn't parse fire time %s %s", tc.fireTime, err) + } + // Count alerts with shouldMute == false and compare to ensure none are muted incorrectly + if !tc.shouldMute { + nonMuteCount++ + } + a := model.Alert{Labels: tc.labels} + alerts := []*types.Alert{{Alert: a}} + ctx := context.Background() + ctx = WithNow(ctx, now) + ctx = WithActiveTimeIntervals(ctx, []string{"test"}) + ctx = WithMuteTimeIntervals(ctx, []string{}) + + _, out, err := stage.Exec(ctx, log.NewNopLogger(), alerts...) 
+ if err != nil { + t.Fatalf("Unexpected error in time mute stage %s", err) + } + outAlerts = append(outAlerts, out...) + } + for _, alert := range outAlerts { + if _, ok := alert.Alert.Labels["mute"]; ok { + t.Fatalf("Expected alert to be muted %+v", alert.Alert) + } + } + if len(outAlerts) != nonMuteCount { + t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts)) + } +} + func BenchmarkHashAlert(b *testing.B) { alert := &types.Alert{ Alert: model.Alert{