From 475ca88009f1e8d2d394fa1f2d9b85f0afb8962f Mon Sep 17 00:00:00 2001 From: Roman Grytskiv Date: Tue, 8 May 2018 01:43:02 +0200 Subject: [PATCH 1/6] Added alert timing distribution --- cmd/bosun/sched/alertRunner.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmd/bosun/sched/alertRunner.go b/cmd/bosun/sched/alertRunner.go index 9d2d3afec8..59919dbc2f 100644 --- a/cmd/bosun/sched/alertRunner.go +++ b/cmd/bosun/sched/alertRunner.go @@ -19,8 +19,10 @@ func (s *Schedule) Run() error { type alertCh struct { ch chan<- *checkContext modulo int + shift int // distribute alert runs } chs := []alertCh{} + circular_shifts := make(map[int]int) for _, a := range s.RuleConf.GetAlerts() { ch := make(chan *checkContext, 1) re := a.RunEvery @@ -28,7 +30,8 @@ func (s *Schedule) Run() error { re = s.SystemConf.GetDefaultRunEvery() } go s.runAlert(a, ch) - chs = append(chs, alertCh{ch: ch, modulo: re}) + chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]}) + circular_shifts[re] = (circular_shifts[re] + 1) % re } i := 0 for { @@ -40,7 +43,8 @@ func (s *Schedule) Run() error { ctx := &checkContext{utcNow(), cache.New(0)} s.LastCheck = utcNow() for _, a := range chs { - if i%a.modulo != 0 { + slog.Infof(">>> Tick!!! i: %v, mod: %v, sh: %v\n", i, a.modulo, a.shift) + if (i+a.shift)%a.modulo != 0 { continue } // Put on channel. If that fails, the alert is backed up pretty bad. From 86a542b3a9c3d0406b886d6e821e8d698048bfca Mon Sep 17 00:00:00 2001 From: Roman Grytskiv Date: Wed, 9 May 2018 14:58:36 +0200 Subject: [PATCH 2/6] Clean-up. 
--- cmd/bosun/sched/alertRunner.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cmd/bosun/sched/alertRunner.go b/cmd/bosun/sched/alertRunner.go index 59919dbc2f..d5f4bed555 100644 --- a/cmd/bosun/sched/alertRunner.go +++ b/cmd/bosun/sched/alertRunner.go @@ -19,9 +19,13 @@ func (s *Schedule) Run() error { type alertCh struct { ch chan<- *checkContext modulo int - shift int // distribute alert runs + shift int // used to distribute alert runs } chs := []alertCh{} + + // Every alert gets a small shift in time. + // This way the alerts with the same period are not fired + // simultaneously, but are distributed. circular_shifts := make(map[int]int) for _, a := range s.RuleConf.GetAlerts() { ch := make(chan *checkContext, 1) @@ -31,6 +35,8 @@ func (s *Schedule) Run() error { } go s.runAlert(a, ch) chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]}) + + // the shifts for a given period range 0..(period - 1) circular_shifts[re] = (circular_shifts[re] + 1) % re } i := 0 @@ -43,7 +49,6 @@ func (s *Schedule) Run() error { ctx := &checkContext{utcNow(), cache.New(0)} s.LastCheck = utcNow() for _, a := range chs { - slog.Infof(">>> Tick!!! i: %v, mod: %v, sh: %v\n", i, a.modulo, a.shift) if (i+a.shift)%a.modulo != 0 { continue } From ee0db452406d919cf11872b2d1bf1799e8f2dce2 Mon Sep 17 00:00:00 2001 From: Roman Grytskiv Date: Wed, 9 May 2018 16:38:26 +0200 Subject: [PATCH 3/6] Added a comment about the shifts map --- cmd/bosun/sched/alertRunner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/bosun/sched/alertRunner.go b/cmd/bosun/sched/alertRunner.go index d5f4bed555..52cb6f1e11 100644 --- a/cmd/bosun/sched/alertRunner.go +++ b/cmd/bosun/sched/alertRunner.go @@ -26,7 +26,7 @@ func (s *Schedule) Run() error { // Every alert gets a small shift in time. // This way the alerts with the same period are not fired // simultaneously, but are distributed. 
- circular_shifts := make(map[int]int) + circular_shifts := make(map[int]int) // the map is *run period* -> *time shift to add* for _, a := range s.RuleConf.GetAlerts() { ch := make(chan *checkContext, 1) re := a.RunEvery From 56440f676372ee6b4ddc87761c0a52499e4cb2db Mon Sep 17 00:00:00 2001 From: Roman Grytskiv Date: Tue, 15 May 2018 17:36:59 +0200 Subject: [PATCH 4/6] Added AlertCheckDistribution configuration option for toggling alert check scattering. --- cmd/bosun/conf/conf.go | 1 + cmd/bosun/conf/system.go | 21 ++++++++++++++------- cmd/bosun/sched/alertRunner.go | 8 +++++++- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/cmd/bosun/conf/conf.go b/cmd/bosun/conf/conf.go index 44ca588360..649b8c17f5 100644 --- a/cmd/bosun/conf/conf.go +++ b/cmd/bosun/conf/conf.go @@ -50,6 +50,7 @@ type SystemConfProvider interface { GetCheckFrequency() time.Duration GetDefaultRunEvery() int + GetAlertCheckDistribution() bool GetUnknownThreshold() int GetMinGroupSize() int diff --git a/cmd/bosun/conf/system.go b/cmd/bosun/conf/system.go index 8776d5ba06..1034b9e905 100644 --- a/cmd/bosun/conf/system.go +++ b/cmd/bosun/conf/system.go @@ -32,9 +32,10 @@ type SystemConf struct { InternetProxy string MinGroupSize int - UnknownThreshold int - CheckFrequency Duration // Time between alert checks: 5m - DefaultRunEvery int // Default number of check intervals to run each alert: 1 + UnknownThreshold int + CheckFrequency Duration // Time between alert checks: 5m + DefaultRunEvery int // Default number of check intervals to run each alert: 1 + AlertCheckDistribution bool // Should the alert rule checks be scattered across their running period? 
DBConf DBConf @@ -223,10 +224,11 @@ const ( // NewSystemConf retruns a system conf with default values set func newSystemConf() *SystemConf { return &SystemConf{ - Scheme: "http", - CheckFrequency: Duration{Duration: time.Minute * 5}, - DefaultRunEvery: 1, - HTTPListen: defaultHTTPListen, + Scheme: "http", + CheckFrequency: Duration{Duration: time.Minute * 5}, + DefaultRunEvery: 1, + HTTPListen: defaultHTTPListen, + AlertCheckDistribution: false, DBConf: DBConf{ LedisDir: "ledis_data", LedisBindAddr: "127.0.0.1:9565", @@ -401,6 +403,11 @@ func (sc *SystemConf) GetDefaultRunEvery() int { return sc.DefaultRunEvery } +// GetAlertCheckDistribution returns if the alert rule checks are scattered over check period +func (sc *SystemConf) GetAlertCheckDistribution() bool { + return sc.AlertCheckDistribution +} + // GetUnknownThreshold returns the threshold in which multiple unknown alerts in a check iteration // should be grouped into a single notification func (sc *SystemConf) GetUnknownThreshold() int { diff --git a/cmd/bosun/sched/alertRunner.go b/cmd/bosun/sched/alertRunner.go index 52cb6f1e11..778b518c63 100644 --- a/cmd/bosun/sched/alertRunner.go +++ b/cmd/bosun/sched/alertRunner.go @@ -34,7 +34,13 @@ func (s *Schedule) Run() error { re = s.SystemConf.GetDefaultRunEvery() } go s.runAlert(a, ch) - chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]}) + + if s.SystemConf.GetAlertCheckDistribution() { // only apply shifts if the respective option is enabled + chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]}) + } else { + // there are no shifts if option is off + chs = append(chs, alertCh{ch: ch, modulo: re, shift: 0}) + } // the shifts for a given period range 0..(period - 1) circular_shifts[re] = (circular_shifts[re] + 1) % re From 699250a4ff413db8bfbe7deb05facfd4a7a65317 Mon Sep 17 00:00:00 2001 From: Roman Grytskiv Date: Wed, 30 May 2018 14:51:37 +0200 Subject: [PATCH 5/6] Changed AlertCheckDistribution option to string 
--- cmd/bosun/conf/conf.go | 2 +- cmd/bosun/conf/system.go | 10 +++++++--- cmd/bosun/sched/alertRunner.go | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmd/bosun/conf/conf.go b/cmd/bosun/conf/conf.go index 649b8c17f5..750adc1f73 100644 --- a/cmd/bosun/conf/conf.go +++ b/cmd/bosun/conf/conf.go @@ -50,7 +50,7 @@ type SystemConfProvider interface { GetCheckFrequency() time.Duration GetDefaultRunEvery() int - GetAlertCheckDistribution() bool + GetAlertCheckDistribution() string GetUnknownThreshold() int GetMinGroupSize() int diff --git a/cmd/bosun/conf/system.go b/cmd/bosun/conf/system.go index 1034b9e905..697b8bbe06 100644 --- a/cmd/bosun/conf/system.go +++ b/cmd/bosun/conf/system.go @@ -35,7 +35,7 @@ type SystemConf struct { UnknownThreshold int CheckFrequency Duration // Time between alert checks: 5m DefaultRunEvery int // Default number of check intervals to run each alert: 1 - AlertCheckDistribution bool // Should the alert rule checks be scattered across their running period? + AlertCheckDistribution string // Method to distribute alert checks. 
No distribution if equals "" DBConf DBConf @@ -228,7 +228,7 @@ func newSystemConf() *SystemConf { CheckFrequency: Duration{Duration: time.Minute * 5}, DefaultRunEvery: 1, HTTPListen: defaultHTTPListen, - AlertCheckDistribution: false, + AlertCheckDistribution: "", DBConf: DBConf{ LedisDir: "ledis_data", LedisBindAddr: "127.0.0.1:9565", @@ -271,6 +271,10 @@ func loadSystemConfig(conf string, isFileName bool) (*SystemConf, error) { return sc, fmt.Errorf("undecoded fields in system configuration: %v", decodeMeta.Undecoded()) } + if sc.GetAlertCheckDistribution() != "" && sc.GetAlertCheckDistribution() != "simple" { + return sc, fmt.Errorf("invalid value %v for AlertCheckDistribution", sc.GetAlertCheckDistribution()) + } + // iterate over each hosts for hostPrefix, value := range sc.ElasticConf { if value.SimpleClient && value.ClientOptions.Enabled { @@ -404,7 +408,7 @@ func (sc *SystemConf) GetDefaultRunEvery() int { } // GetAlertCheckDistribution returns if the alert rule checks are scattered over check period -func (sc *SystemConf) GetAlertCheckDistribution() bool { +func (sc *SystemConf) GetAlertCheckDistribution() string { return sc.AlertCheckDistribution } diff --git a/cmd/bosun/sched/alertRunner.go b/cmd/bosun/sched/alertRunner.go index 778b518c63..0866c56a9d 100644 --- a/cmd/bosun/sched/alertRunner.go +++ b/cmd/bosun/sched/alertRunner.go @@ -35,7 +35,7 @@ func (s *Schedule) Run() error { } go s.runAlert(a, ch) - if s.SystemConf.GetAlertCheckDistribution() { // only apply shifts if the respective option is enabled + if s.SystemConf.GetAlertCheckDistribution() == "simple" { // only apply shifts if the respective option is set chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]}) } else { // there are no shifts if option is off From a5c3c67401fb811e454edfb69ea92806d78b35f8 Mon Sep 17 00:00:00 2001 From: Roman Grytskiv Date: Fri, 8 Jun 2018 11:34:22 +0200 Subject: [PATCH 6/6] Added documentation for AlertCheckDistribution system config option 
--- docs/system_configuration.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/system_configuration.md b/docs/system_configuration.md index b7d5125053..a784ac0a73 100644 --- a/docs/system_configuration.md +++ b/docs/system_configuration.md @@ -103,6 +103,13 @@ frequent as every "1m", and others that run less often (any multiple of Example: `DefaultRunEvery = 5` +### AlertCheckDistribution +Selects the algorithm used to distribute alert checks in order to reduce system load spikes. There is no distribution by default. This means that if there are several checks with the same period, they will all run at the same points in time. This default behavior is used if the option is not specified or is set to an empty string. + +The only alternative value is `simple`. If specified, the alert checks with the same period will be uniformly distributed on second marks. + +Example: `AlertCheckDistribution = "simple"` + ### RuleFilePath Path to the file containing definitions of alerts, macros, lookups, templates, notifications, and global variables which are [documented