diff --git a/cmd/bosun/conf/conf.go b/cmd/bosun/conf/conf.go index 44ca588360..750adc1f73 100644 --- a/cmd/bosun/conf/conf.go +++ b/cmd/bosun/conf/conf.go @@ -50,6 +50,7 @@ type SystemConfProvider interface { GetCheckFrequency() time.Duration GetDefaultRunEvery() int + GetAlertCheckDistribution() string GetUnknownThreshold() int GetMinGroupSize() int diff --git a/cmd/bosun/conf/system.go b/cmd/bosun/conf/system.go index 8776d5ba06..697b8bbe06 100644 --- a/cmd/bosun/conf/system.go +++ b/cmd/bosun/conf/system.go @@ -32,9 +32,10 @@ type SystemConf struct { InternetProxy string MinGroupSize int - UnknownThreshold int - CheckFrequency Duration // Time between alert checks: 5m - DefaultRunEvery int // Default number of check intervals to run each alert: 1 + UnknownThreshold int + CheckFrequency Duration // Time between alert checks: 5m + DefaultRunEvery int // Default number of check intervals to run each alert: 1 + AlertCheckDistribution string // Method to distribute alert checks. No distribution if equal to "" DBConf DBConf @@ -223,10 +224,11 @@ const ( // NewSystemConf retruns a system conf with default values set func newSystemConf() *SystemConf { return &SystemConf{ - Scheme: "http", - CheckFrequency: Duration{Duration: time.Minute * 5}, - DefaultRunEvery: 1, - HTTPListen: defaultHTTPListen, + Scheme: "http", + CheckFrequency: Duration{Duration: time.Minute * 5}, + DefaultRunEvery: 1, + HTTPListen: defaultHTTPListen, + AlertCheckDistribution: "", DBConf: DBConf{ LedisDir: "ledis_data", LedisBindAddr: "127.0.0.1:9565", @@ -269,6 +271,10 @@ func loadSystemConfig(conf string, isFileName bool) (*SystemConf, error) { return sc, fmt.Errorf("undecoded fields in system configuration: %v", decodeMeta.Undecoded()) } + if sc.GetAlertCheckDistribution() != "" && sc.GetAlertCheckDistribution() != "simple" { + return sc, fmt.Errorf("invalid value %v for AlertCheckDistribution", sc.GetAlertCheckDistribution()) + } + // iterate over each hosts for hostPrefix, value := range 
sc.ElasticConf { if value.SimpleClient && value.ClientOptions.Enabled { @@ -401,6 +407,11 @@ func (sc *SystemConf) GetDefaultRunEvery() int { return sc.DefaultRunEvery } +// GetAlertCheckDistribution returns the configured method for distributing alert rule checks over the check period ("" means no distribution) +func (sc *SystemConf) GetAlertCheckDistribution() string { + return sc.AlertCheckDistribution +} + // GetUnknownThreshold returns the threshold in which multiple unknown alerts in a check iteration // should be grouped into a single notification func (sc *SystemConf) GetUnknownThreshold() int { diff --git a/cmd/bosun/sched/alertRunner.go b/cmd/bosun/sched/alertRunner.go index 9d2d3afec8..0866c56a9d 100644 --- a/cmd/bosun/sched/alertRunner.go +++ b/cmd/bosun/sched/alertRunner.go @@ -19,8 +19,14 @@ func (s *Schedule) Run() error { type alertCh struct { ch chan<- *checkContext modulo int + shift int // offset added to the check counter to distribute alert runs } chs := []alertCh{} + + // Every alert gets a small shift in time. + // This way the alerts with the same period are not fired + // simultaneously, but are distributed. 
+ circular_shifts := make(map[int]int) // the map is *run period* -> *time shift to add* for _, a := range s.RuleConf.GetAlerts() { ch := make(chan *checkContext, 1) re := a.RunEvery @@ -28,7 +34,16 @@ func (s *Schedule) Run() error { re = s.SystemConf.GetDefaultRunEvery() } go s.runAlert(a, ch) - chs = append(chs, alertCh{ch: ch, modulo: re}) + + if s.SystemConf.GetAlertCheckDistribution() == "simple" { // only apply shifts if the respective option is set + chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]}) + } else { + // there are no shifts if option is off + chs = append(chs, alertCh{ch: ch, modulo: re, shift: 0}) + } + + // the shifts for a given period range 0..(period - 1) + circular_shifts[re] = (circular_shifts[re] + 1) % re } i := 0 for { @@ -40,7 +55,7 @@ func (s *Schedule) Run() error { ctx := &checkContext{utcNow(), cache.New(0)} s.LastCheck = utcNow() for _, a := range chs { - if i%a.modulo != 0 { + if (i+a.shift)%a.modulo != 0 { continue } // Put on channel. If that fails, the alert is backed up pretty bad. diff --git a/docs/system_configuration.md b/docs/system_configuration.md index b7d5125053..a784ac0a73 100644 --- a/docs/system_configuration.md +++ b/docs/system_configuration.md @@ -103,6 +103,13 @@ frequent as every "1m", and others that run less often (any multiple of Example: `DefaultRunEvery = 5` +### AlertCheckDistribution +Selects the algorithm used to distribute alert checks in order to reduce system load spikes. There is no distribution by default: if several checks have the same period, they all run at the same points in time. This default applies when the option is not specified or is set to an empty string. + +The only alternative value is `simple`. If specified, the alert checks with the same period will be uniformly distributed on second marks. 
+ +Example: `AlertCheckDistribution = "simple"` + ### RuleFilePath Path to the file containing definitions of alerts, macros, lookups, templates, notifications, and global variables which are [documented