Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow tuning of heartbeat ttls #2859

Merged
merged 1 commit into from
Jul 19, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,14 @@ func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Confi
conf.DeploymentGCThreshold = dur
}

if heartbeatGrace := agentConfig.Server.HeartbeatGrace; heartbeatGrace != "" {
dur, err := time.ParseDuration(heartbeatGrace)
if err != nil {
return nil, err
}
conf.HeartbeatGrace = dur
if heartbeatGrace := agentConfig.Server.HeartbeatGrace; heartbeatGrace != 0 {
conf.HeartbeatGrace = heartbeatGrace
}
if min := agentConfig.Server.MinHeartbeatTTL; min != 0 {
conf.MinHeartbeatTTL = min
}
if maxHPS := agentConfig.Server.MaxHeartbeatsPerSecond; maxHPS != 0 {
conf.MaxHeartbeatsPerSecond = maxHPS
}

if *agentConfig.Consul.AutoAdvertise && agentConfig.Consul.ServerServiceName == "" {
Expand Down
22 changes: 11 additions & 11 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,22 +233,22 @@ func TestAgent_ServerConfig(t *testing.T) {
t.Fatalf("expect 10s, got: %s", threshold)
}

conf.Server.HeartbeatGrace = "42g"
if err := conf.normalizeAddrs(); err != nil {
t.Fatalf("error normalizing config: %v", err)
}
conf.Server.HeartbeatGrace = 37 * time.Second
out, err = a.serverConfig()
if err == nil || !strings.Contains(err.Error(), "unknown unit") {
t.Fatalf("expected unknown unit error, got: %#v", err)
if threshold := out.HeartbeatGrace; threshold != time.Second*37 {
t.Fatalf("expect 37s, got: %s", threshold)
}

conf.Server.HeartbeatGrace = "37s"
if err := conf.normalizeAddrs(); err != nil {
t.Fatalf("error normalizing config: %v", err)
conf.Server.MinHeartbeatTTL = 37 * time.Second
out, err = a.serverConfig()
if min := out.MinHeartbeatTTL; min != time.Second*37 {
t.Fatalf("expect 37s, got: %s", min)
}

conf.Server.MaxHeartbeatsPerSecond = 11.0
out, err = a.serverConfig()
if threshold := out.HeartbeatGrace; threshold != time.Second*37 {
t.Fatalf("expect 37s, got: %s", threshold)
if max := out.MaxHeartbeatsPerSecond; max != 11.0 {
t.Fatalf("expect 11, got: %s", max)
}

// Defaults to the global bind addr
Expand Down
2 changes: 2 additions & 0 deletions command/agent/config-test-fixtures/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ server {
eval_gc_threshold = "12h"
deployment_gc_threshold = "12h"
heartbeat_grace = "30s"
min_heartbeat_ttl = "33s"
max_heartbeats_per_second = 11.0
retry_join = [ "1.1.1.1", "2.2.2.2" ]
start_join = [ "1.1.1.1", "2.2.2.2" ]
retry_max = 3
Expand Down
19 changes: 17 additions & 2 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,16 @@ type ServerConfig struct {

// HeartbeatGrace is the grace period beyond the TTL to account for network,
// processing delays and clock skew before marking a node as "down".
HeartbeatGrace string `mapstructure:"heartbeat_grace"`
HeartbeatGrace time.Duration `mapstructure:"heartbeat_grace"`

// MinHeartbeatTTL is the minimum time between heartbeats. This is used as
// a floor to prevent excessive updates.
MinHeartbeatTTL time.Duration `mapstructure:"min_heartbeat_ttl"`

// MaxHeartbeatsPerSecond is the maximum target rate of heartbeats
// being processed per second. This allows the TTL to be increased
// to meet the target rate.
MaxHeartbeatsPerSecond float64 `mapstructure:"max_heartbeats_per_second"`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why float?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thats what the underlying server config uses.


// StartJoin is a list of addresses to attempt to join when the
// agent starts. If Serf is unable to communicate with any of these
Expand Down Expand Up @@ -924,9 +933,15 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
if b.DeploymentGCThreshold != "" {
result.DeploymentGCThreshold = b.DeploymentGCThreshold
}
if b.HeartbeatGrace != "" {
if b.HeartbeatGrace != 0 {
result.HeartbeatGrace = b.HeartbeatGrace
}
if b.MinHeartbeatTTL != 0 {
result.MinHeartbeatTTL = b.MinHeartbeatTTL
}
if b.MaxHeartbeatsPerSecond != 0.0 {
result.MaxHeartbeatsPerSecond = b.MaxHeartbeatsPerSecond
}
if b.RetryMaxAttempts != 0 {
result.RetryMaxAttempts = b.RetryMaxAttempts
}
Expand Down
12 changes: 11 additions & 1 deletion command/agent/config_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,8 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
"job_gc_threshold",
"deployment_gc_threshold",
"heartbeat_grace",
"min_heartbeat_ttl",
"max_heartbeats_per_second",
"start_join",
"retry_join",
"retry_max",
Expand All @@ -523,7 +525,15 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
}

var config ServerConfig
if err := mapstructure.WeakDecode(m, &config); err != nil {
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
WeaklyTypedInput: true,
Result: &config,
})
if err != nil {
return err
}
if err := dec.Decode(m); err != nil {
return err
}

Expand Down
36 changes: 19 additions & 17 deletions command/agent/config_parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,23 +82,25 @@ func TestConfig_Parse(t *testing.T) {
NoHostUUID: helper.BoolToPtr(false),
},
Server: &ServerConfig{
Enabled: true,
BootstrapExpect: 5,
DataDir: "/tmp/data",
ProtocolVersion: 3,
NumSchedulers: 2,
EnabledSchedulers: []string{"test"},
NodeGCThreshold: "12h",
EvalGCThreshold: "12h",
JobGCThreshold: "12h",
DeploymentGCThreshold: "12h",
HeartbeatGrace: "30s",
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
StartJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: "15s",
RejoinAfterLeave: true,
RetryMaxAttempts: 3,
EncryptKey: "abc",
Enabled: true,
BootstrapExpect: 5,
DataDir: "/tmp/data",
ProtocolVersion: 3,
NumSchedulers: 2,
EnabledSchedulers: []string{"test"},
NodeGCThreshold: "12h",
EvalGCThreshold: "12h",
JobGCThreshold: "12h",
DeploymentGCThreshold: "12h",
HeartbeatGrace: 30 * time.Second,
MinHeartbeatTTL: 33 * time.Second,
MaxHeartbeatsPerSecond: 11.0,
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
StartJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: "15s",
RejoinAfterLeave: true,
RetryMaxAttempts: 3,
EncryptKey: "abc",
},
Telemetry: &Telemetry{
StatsiteAddr: "127.0.0.1:1234",
Expand Down
44 changes: 24 additions & 20 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,15 @@ func TestConfig_Merge(t *testing.T) {
},
},
Server: &ServerConfig{
Enabled: false,
BootstrapExpect: 1,
DataDir: "/tmp/data1",
ProtocolVersion: 1,
NumSchedulers: 1,
NodeGCThreshold: "1h",
HeartbeatGrace: "30s",
Enabled: false,
BootstrapExpect: 1,
DataDir: "/tmp/data1",
ProtocolVersion: 1,
NumSchedulers: 1,
NodeGCThreshold: "1h",
HeartbeatGrace: 30 * time.Second,
MinHeartbeatTTL: 30 * time.Second,
MaxHeartbeatsPerSecond: 30.0,
},
Ports: &Ports{
HTTP: 4646,
Expand Down Expand Up @@ -220,19 +222,21 @@ func TestConfig_Merge(t *testing.T) {
GCInodeUsageThreshold: 86,
},
Server: &ServerConfig{
Enabled: true,
BootstrapExpect: 2,
DataDir: "/tmp/data2",
ProtocolVersion: 2,
NumSchedulers: 2,
EnabledSchedulers: []string{structs.JobTypeBatch},
NodeGCThreshold: "12h",
HeartbeatGrace: "2m",
RejoinAfterLeave: true,
StartJoin: []string{"1.1.1.1"},
RetryJoin: []string{"1.1.1.1"},
RetryInterval: "10s",
retryInterval: time.Second * 10,
Enabled: true,
BootstrapExpect: 2,
DataDir: "/tmp/data2",
ProtocolVersion: 2,
NumSchedulers: 2,
EnabledSchedulers: []string{structs.JobTypeBatch},
NodeGCThreshold: "12h",
HeartbeatGrace: 2 * time.Minute,
MinHeartbeatTTL: 2 * time.Minute,
MaxHeartbeatsPerSecond: 200.0,
RejoinAfterLeave: true,
StartJoin: []string{"1.1.1.1"},
RetryJoin: []string{"1.1.1.1"},
RetryInterval: "10s",
retryInterval: time.Second * 10,
},
Ports: &Ports{
HTTP: 20000,
Expand Down
17 changes: 17 additions & 0 deletions website/source/docs/agent/configuration/server.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,23 @@ server {
deployment must be in the terminal state before it is eligible for garbage
collection. This is specified using a label suffix like "30s" or "1h".

- `heartbeat_grace` `(string: "10s")` - Specifies the additional time given as a
grace period beyond the heartbeat TTL of nodes to account for network and
processing delays as well as clock skew. This is specified using a label
suffix like "30s" or "1h".

- `min_heartbeat_ttl` `(string: "10s")` - Specifies the minimum time between
node heartbeats. This is used as a floor to prevent excessive updates. This is
specified using a label suffix like "30s" or "1h". Lowering the minimum TTL is
a tradeoff as it lowers failure detection time of nodes at the tradeoff of
false positives and increased load on the leader.

- `max_heartbeats_per_second` `(float: 50.0)` - Specifies the maximum target
rate of heartbeats being processed per second. This allows the TTL to be
increased to meet the target rate. Increasing the maximum heartbeats per
second is a tradeoff as it lowers failure detection time of nodes at the
tradeoff of false positives and increased load on the leader.

- `num_schedulers` `(int: [num-cores])` - Specifies the number of parallel
scheduler threads to run. This can be as many as one per core, or `0` to
disallow this server from making any scheduling decisions. This defaults to
Expand Down