From bb45b95bc4eff522129e4b6c41b505d52b6f9790 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 19 Jul 2017 09:38:35 -0700 Subject: [PATCH] Allow tuning of heartbeat ttls This PR allows tuning of heartbeat TTLs. An example of very aggressive settings is as follows: ``` server { heartbeat_grace = "1s" min_heartbeat_ttl = "1s" max_heartbeats_per_second = 200.0 } ``` --- command/agent/agent.go | 14 +++--- command/agent/agent_test.go | 22 +++++----- command/agent/config-test-fixtures/basic.hcl | 2 + command/agent/config.go | 19 +++++++- command/agent/config_parse.go | 12 ++++- command/agent/config_parse_test.go | 36 ++++++++------- command/agent/config_test.go | 44 ++++++++++--------- .../docs/agent/configuration/server.html.md | 17 +++++++ 8 files changed, 109 insertions(+), 57 deletions(-) diff --git a/command/agent/agent.go b/command/agent/agent.go index 27c0f07f3fba..5fc211174dc0 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -192,12 +192,14 @@ func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Confi conf.DeploymentGCThreshold = dur } - if heartbeatGrace := agentConfig.Server.HeartbeatGrace; heartbeatGrace != "" { - dur, err := time.ParseDuration(heartbeatGrace) - if err != nil { - return nil, err - } - conf.HeartbeatGrace = dur + if heartbeatGrace := agentConfig.Server.HeartbeatGrace; heartbeatGrace != 0 { + conf.HeartbeatGrace = heartbeatGrace + } + if min := agentConfig.Server.MinHeartbeatTTL; min != 0 { + conf.MinHeartbeatTTL = min + } + if maxHPS := agentConfig.Server.MaxHeartbeatsPerSecond; maxHPS != 0 { + conf.MaxHeartbeatsPerSecond = maxHPS } if *agentConfig.Consul.AutoAdvertise && agentConfig.Consul.ServerServiceName == "" { diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index a06cc51dbd0e..b7af086f29bf 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -233,22 +233,22 @@ func TestAgent_ServerConfig(t *testing.T) { t.Fatalf("expect 10s, got: %s", threshold) 
} - conf.Server.HeartbeatGrace = "42g" - if err := conf.normalizeAddrs(); err != nil { - t.Fatalf("error normalizing config: %v", err) - } + conf.Server.HeartbeatGrace = 37 * time.Second out, err = a.serverConfig() - if err == nil || !strings.Contains(err.Error(), "unknown unit") { - t.Fatalf("expected unknown unit error, got: %#v", err) + if threshold := out.HeartbeatGrace; threshold != time.Second*37 { + t.Fatalf("expect 37s, got: %s", threshold) } - conf.Server.HeartbeatGrace = "37s" - if err := conf.normalizeAddrs(); err != nil { - t.Fatalf("error normalizing config: %v", err) + conf.Server.MinHeartbeatTTL = 37 * time.Second + out, err = a.serverConfig() + if min := out.MinHeartbeatTTL; min != time.Second*37 { + t.Fatalf("expect 37s, got: %s", min) } + + conf.Server.MaxHeartbeatsPerSecond = 11.0 out, err = a.serverConfig() - if threshold := out.HeartbeatGrace; threshold != time.Second*37 { - t.Fatalf("expect 37s, got: %s", threshold) + if max := out.MaxHeartbeatsPerSecond; max != 11.0 { + t.Fatalf("expect 11, got: %s", max) } // Defaults to the global bind addr diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl index faf2474e2422..00ffdb10e7fa 100644 --- a/command/agent/config-test-fixtures/basic.hcl +++ b/command/agent/config-test-fixtures/basic.hcl @@ -73,6 +73,8 @@ server { eval_gc_threshold = "12h" deployment_gc_threshold = "12h" heartbeat_grace = "30s" + min_heartbeat_ttl = "33s" + max_heartbeats_per_second = 11.0 retry_join = [ "1.1.1.1", "2.2.2.2" ] start_join = [ "1.1.1.1", "2.2.2.2" ] retry_max = 3 diff --git a/command/agent/config.go b/command/agent/config.go index b533a245b4f9..36769cb30016 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -277,7 +277,16 @@ type ServerConfig struct { // HeartbeatGrace is the grace period beyond the TTL to account for network, // processing delays and clock skew before marking a node as "down". 
- HeartbeatGrace string `mapstructure:"heartbeat_grace"` + HeartbeatGrace time.Duration `mapstructure:"heartbeat_grace"` + + // MinHeartbeatTTL is the minimum time between heartbeats. This is used as + // a floor to prevent excessive updates. + MinHeartbeatTTL time.Duration `mapstructure:"min_heartbeat_ttl"` + + // MaxHeartbeatsPerSecond is the maximum target rate of heartbeats + // being processed per second. This allows the TTL to be increased + // to meet the target rate. + MaxHeartbeatsPerSecond float64 `mapstructure:"max_heartbeats_per_second"` // StartJoin is a list of addresses to attempt to join when the // agent starts. If Serf is unable to communicate with any of these @@ -924,9 +933,15 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig { if b.DeploymentGCThreshold != "" { result.DeploymentGCThreshold = b.DeploymentGCThreshold } - if b.HeartbeatGrace != "" { + if b.HeartbeatGrace != 0 { result.HeartbeatGrace = b.HeartbeatGrace } + if b.MinHeartbeatTTL != 0 { + result.MinHeartbeatTTL = b.MinHeartbeatTTL + } + if b.MaxHeartbeatsPerSecond != 0.0 { + result.MaxHeartbeatsPerSecond = b.MaxHeartbeatsPerSecond + } if b.RetryMaxAttempts != 0 { result.RetryMaxAttempts = b.RetryMaxAttempts } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index f8011527b5b8..de1401cb56d2 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -506,6 +506,8 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error { "job_gc_threshold", "deployment_gc_threshold", "heartbeat_grace", + "min_heartbeat_ttl", + "max_heartbeats_per_second", "start_join", "retry_join", "retry_max", @@ -523,7 +525,15 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error { } var config ServerConfig - if err := mapstructure.WeakDecode(m, &config); err != nil { + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: true, + 
Result: &config, + }) + if err != nil { + return err + } + if err := dec.Decode(m); err != nil { return err } diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 7d9b969cc119..8caaaf4a0b04 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -82,23 +82,25 @@ func TestConfig_Parse(t *testing.T) { NoHostUUID: helper.BoolToPtr(false), }, Server: &ServerConfig{ - Enabled: true, - BootstrapExpect: 5, - DataDir: "/tmp/data", - ProtocolVersion: 3, - NumSchedulers: 2, - EnabledSchedulers: []string{"test"}, - NodeGCThreshold: "12h", - EvalGCThreshold: "12h", - JobGCThreshold: "12h", - DeploymentGCThreshold: "12h", - HeartbeatGrace: "30s", - RetryJoin: []string{"1.1.1.1", "2.2.2.2"}, - StartJoin: []string{"1.1.1.1", "2.2.2.2"}, - RetryInterval: "15s", - RejoinAfterLeave: true, - RetryMaxAttempts: 3, - EncryptKey: "abc", + Enabled: true, + BootstrapExpect: 5, + DataDir: "/tmp/data", + ProtocolVersion: 3, + NumSchedulers: 2, + EnabledSchedulers: []string{"test"}, + NodeGCThreshold: "12h", + EvalGCThreshold: "12h", + JobGCThreshold: "12h", + DeploymentGCThreshold: "12h", + HeartbeatGrace: 30 * time.Second, + MinHeartbeatTTL: 33 * time.Second, + MaxHeartbeatsPerSecond: 11.0, + RetryJoin: []string{"1.1.1.1", "2.2.2.2"}, + StartJoin: []string{"1.1.1.1", "2.2.2.2"}, + RetryInterval: "15s", + RejoinAfterLeave: true, + RetryMaxAttempts: 3, + EncryptKey: "abc", }, Telemetry: &Telemetry{ StatsiteAddr: "127.0.0.1:1234", diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 4f56807a8b5f..cdfa36baad18 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -90,13 +90,15 @@ func TestConfig_Merge(t *testing.T) { }, }, Server: &ServerConfig{ - Enabled: false, - BootstrapExpect: 1, - DataDir: "/tmp/data1", - ProtocolVersion: 1, - NumSchedulers: 1, - NodeGCThreshold: "1h", - HeartbeatGrace: "30s", + Enabled: false, + BootstrapExpect: 1, + DataDir: "/tmp/data1", + 
ProtocolVersion: 1, + NumSchedulers: 1, + NodeGCThreshold: "1h", + HeartbeatGrace: 30 * time.Second, + MinHeartbeatTTL: 30 * time.Second, + MaxHeartbeatsPerSecond: 30.0, }, Ports: &Ports{ HTTP: 4646, @@ -220,19 +222,21 @@ func TestConfig_Merge(t *testing.T) { GCInodeUsageThreshold: 86, }, Server: &ServerConfig{ - Enabled: true, - BootstrapExpect: 2, - DataDir: "/tmp/data2", - ProtocolVersion: 2, - NumSchedulers: 2, - EnabledSchedulers: []string{structs.JobTypeBatch}, - NodeGCThreshold: "12h", - HeartbeatGrace: "2m", - RejoinAfterLeave: true, - StartJoin: []string{"1.1.1.1"}, - RetryJoin: []string{"1.1.1.1"}, - RetryInterval: "10s", - retryInterval: time.Second * 10, + Enabled: true, + BootstrapExpect: 2, + DataDir: "/tmp/data2", + ProtocolVersion: 2, + NumSchedulers: 2, + EnabledSchedulers: []string{structs.JobTypeBatch}, + NodeGCThreshold: "12h", + HeartbeatGrace: 2 * time.Minute, + MinHeartbeatTTL: 2 * time.Minute, + MaxHeartbeatsPerSecond: 200.0, + RejoinAfterLeave: true, + StartJoin: []string{"1.1.1.1"}, + RetryJoin: []string{"1.1.1.1"}, + RetryInterval: "10s", + retryInterval: time.Second * 10, }, Ports: &Ports{ HTTP: 20000, diff --git a/website/source/docs/agent/configuration/server.html.md b/website/source/docs/agent/configuration/server.html.md index ba7f1bcd2f6d..1e55aedc4629 100644 --- a/website/source/docs/agent/configuration/server.html.md +++ b/website/source/docs/agent/configuration/server.html.md @@ -80,6 +80,23 @@ server { deployment must be in the terminal state before it is eligible for garbage collection. This is specified using a label suffix like "30s" or "1h". +- `heartbeat_grace` `(string: "10s")` - Specifies the additional time given as a + grace period beyond the heartbeat TTL of nodes to account for network and + processing delays as well as clock skew. This is specified using a label + suffix like "30s" or "1h". + +- `min_heartbeat_ttl` `(string: "10s")` - Specifies the minimum time between + node heartbeats. 
This is used as a floor to prevent excessive updates. This is + specified using a label suffix like "30s" or "1h". Lowering the minimum TTL is + a tradeoff: it shortens node failure detection time at the cost of more + false positives and increased load on the leader. + +- `max_heartbeats_per_second` `(float: 50.0)` - Specifies the maximum target + rate of heartbeats being processed per second. This allows the TTL to be + increased to meet the target rate. Increasing the maximum heartbeats per + second is a tradeoff: it shortens node failure detection time at the cost + of more false positives and increased load on the leader. + - `num_schedulers` `(int: [num-cores])` - Specifies the number of parallel scheduler threads to run. This can be as many as one per core, or `0` to disallow this server from making any scheduling decisions. This defaults to