diff --git a/.changelog/11127.txt b/.changelog/11127.txt new file mode 100644 index 000000000000..245a4f8ef7eb --- /dev/null +++ b/.changelog/11127.txt @@ -0,0 +1,3 @@ +```release-note:improvement +server: Allow tuning of node failover heartbeat TTL +``` diff --git a/command/agent/agent.go b/command/agent/agent.go index a500dea7f431..1943fb68289f 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -380,6 +380,9 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) { if maxHPS := agentConfig.Server.MaxHeartbeatsPerSecond; maxHPS != 0 { conf.MaxHeartbeatsPerSecond = maxHPS } + if failoverTTL := agentConfig.Server.FailoverHeartbeatTTL; failoverTTL != 0 { + conf.FailoverHeartbeatTTL = failoverTTL + } if *agentConfig.Consul.AutoAdvertise && agentConfig.Consul.ServerServiceName == "" { return nil, fmt.Errorf("server_service_name must be set when auto_advertise is enabled") diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 39f8380b37e6..0182ca26429a 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -141,6 +141,11 @@ func TestAgent_ServerConfig(t *testing.T) { require.NoError(t, err) require.Equal(t, float64(11.0), out.MaxHeartbeatsPerSecond) + conf.Server.FailoverHeartbeatTTL = 337 * time.Second + out, err = a.serverConfig() + require.NoError(t, err) + require.Equal(t, 337*time.Second, out.FailoverHeartbeatTTL) + // Defaults to the global bind addr conf.Addresses.RPC = "" conf.Addresses.Serf = "" diff --git a/command/agent/config.go b/command/agent/config.go index 687b3f03748e..439fdebfb849 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -441,6 +441,12 @@ type ServerConfig struct { // to meet the target rate. MaxHeartbeatsPerSecond float64 `hcl:"max_heartbeats_per_second"` + // FailoverHeartbeatTTL is the TTL applied to heartbeats after + // a new leader is elected, since we no longer know the status + // of all the heartbeats. 
+ FailoverHeartbeatTTL time.Duration + FailoverHeartbeatTTLHCL string `hcl:"failover_heartbeat_ttl" json:"-"` + // StartJoin is a list of addresses to attempt to join when the // agent starts. If Serf is unable to communicate with any of these // addresses, then the agent will error and exit. @@ -1484,6 +1490,12 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig { if b.MaxHeartbeatsPerSecond != 0.0 { result.MaxHeartbeatsPerSecond = b.MaxHeartbeatsPerSecond } + if b.FailoverHeartbeatTTL != 0 { + result.FailoverHeartbeatTTL = b.FailoverHeartbeatTTL + } + if b.FailoverHeartbeatTTLHCL != "" { + result.FailoverHeartbeatTTLHCL = b.FailoverHeartbeatTTLHCL + } if b.RetryMaxAttempts != 0 { result.RetryMaxAttempts = b.RetryMaxAttempts } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 319e7ef195cb..b745835711df 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -55,6 +55,7 @@ func ParseConfigFile(path string) (*Config, error) { {"client.server_join.retry_interval", &c.Client.ServerJoin.RetryInterval, &c.Client.ServerJoin.RetryIntervalHCL}, {"server.heartbeat_grace", &c.Server.HeartbeatGrace, &c.Server.HeartbeatGraceHCL}, {"server.min_heartbeat_ttl", &c.Server.MinHeartbeatTTL, &c.Server.MinHeartbeatTTLHCL}, + {"server.failover_heartbeat_ttl", &c.Server.FailoverHeartbeatTTL, &c.Server.FailoverHeartbeatTTLHCL}, {"server.retry_interval", &c.Server.RetryInterval, &c.Server.RetryIntervalHCL}, {"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL}, {"consul.timeout", &c.Consul.Timeout, &c.Consul.TimeoutHCL}, diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 9f6d66a69186..5ee1305012f9 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -112,6 +112,8 @@ var basicConfig = &Config{ MinHeartbeatTTL: 33 * time.Second, MinHeartbeatTTLHCL: "33s", MaxHeartbeatsPerSecond: 11.0, + 
FailoverHeartbeatTTL: 330 * time.Second, + FailoverHeartbeatTTLHCL: "330s", RetryJoin: []string{"1.1.1.1", "2.2.2.2"}, StartJoin: []string{"1.1.1.1", "2.2.2.2"}, RetryInterval: 15 * time.Second, diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl index b4a55197d572..c28cdfd938fe 100644 --- a/command/agent/testdata/basic.hcl +++ b/command/agent/testdata/basic.hcl @@ -120,6 +120,7 @@ server { heartbeat_grace = "30s" min_heartbeat_ttl = "33s" max_heartbeats_per_second = 11.0 + failover_heartbeat_ttl = "330s" retry_join = ["1.1.1.1", "2.2.2.2"] start_join = ["1.1.1.1", "2.2.2.2"] retry_max = 3 diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json index 02de2490c4c3..a92d7748d503 100644 --- a/command/agent/testdata/basic.json +++ b/command/agent/testdata/basic.json @@ -273,6 +273,7 @@ "job_gc_threshold": "12h", "max_heartbeats_per_second": 11, "min_heartbeat_ttl": "33s", + "failover_heartbeat_ttl": "330s", "node_gc_threshold": "12h", "non_voting_server": true, "num_schedulers": 2, diff --git a/website/content/docs/configuration/server.mdx b/website/content/docs/configuration/server.mdx index aa9763f774fe..b8b4647d2d06 100644 --- a/website/content/docs/configuration/server.mdx +++ b/website/content/docs/configuration/server.mdx @@ -131,6 +131,16 @@ server { a tradeoff as it lowers failure detection time of nodes at the tradeoff of false positives and increased load on the leader. +- `failover_heartbeat_ttl` `(string: "5m")` - Specifies the TTL applied to + heartbeats after a new leader is elected, since the status of all the + heartbeats is no longer known. This is specified using a label suffix like + "30s" or "1h". + + ~> Lowering the `failover_heartbeat_ttl` shortens the time needed to detect + failed nodes, at the cost of more false positives. False positives + could cause all clients to stop their allocations if a leadership transition + lasts longer than `heartbeat_grace + failover_heartbeat_ttl`. 
+ - `max_heartbeats_per_second` `(float: 50.0)` - Specifies the maximum target rate of heartbeats being processed per second. This allows the TTL to be increased to meet the target rate. Increasing the maximum heartbeats per