Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds performance tuning capability for Raft, detuned defaults, and supplemental docs. #2303

Merged
merged 6 commits into from
Aug 25, 2016
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,11 @@ func (a *Agent) consulConfig() *consul.Config {
// Apply dev mode
base.DevMode = a.config.DevMode

// Apply performance factors
if a.config.Performance.RaftMultiplier > 0 {
base.ScaleRaft(a.config.Performance.RaftMultiplier)
}

// Override with our config
if a.config.Datacenter != "" {
base.Datacenter = a.config.Datacenter
Expand Down
41 changes: 41 additions & 0 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/hashicorp/consul/consul"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/raft"
)

const (
Expand Down Expand Up @@ -191,6 +192,46 @@ func TestAgent_CheckAdvertiseAddrsSettings(t *testing.T) {
}
}

// TestAgent_CheckPerformanceSettings verifies that the Raft timing parameters
// in the agent's generated Consul config equal the Raft library defaults
// scaled by the effective multiplier: consul.DefaultRaftMultiplier for a
// default config, and Performance.RaftMultiplier when one is set.
func TestAgent_CheckPerformanceSettings(t *testing.T) {
	// Each case runs inside its own closure so that the deferred shutdown and
	// directory removal fire when that case ends. With bare blocks the defers
	// would only run when the whole test returns, leaving the first agent
	// alive while the second one is created.

	// Try a default config.
	func() {
		c := nextConfig()
		c.ConsulConfig = nil
		dir, agent := makeAgent(t, c)
		defer os.RemoveAll(dir)
		defer agent.Shutdown()

		raftMult := time.Duration(consul.DefaultRaftMultiplier)
		r := agent.consulConfig().RaftConfig
		def := raft.DefaultConfig()
		if r.HeartbeatTimeout != raftMult*def.HeartbeatTimeout ||
			r.ElectionTimeout != raftMult*def.ElectionTimeout ||
			r.CommitTimeout != raftMult*def.CommitTimeout ||
			r.LeaderLeaseTimeout != raftMult*def.LeaderLeaseTimeout {
			t.Fatalf("bad: %#v", *r)
		}
	}()

	// Try a multiplier.
	func() {
		c := nextConfig()
		c.Performance.RaftMultiplier = 99
		dir, agent := makeAgent(t, c)
		defer os.RemoveAll(dir)
		defer agent.Shutdown()

		const raftMult time.Duration = 99
		r := agent.consulConfig().RaftConfig
		def := raft.DefaultConfig()
		if r.HeartbeatTimeout != raftMult*def.HeartbeatTimeout ||
			r.ElectionTimeout != raftMult*def.ElectionTimeout ||
			r.CommitTimeout != raftMult*def.CommitTimeout ||
			r.LeaderLeaseTimeout != raftMult*def.LeaderLeaseTimeout {
			t.Fatalf("bad: %#v", *r)
		}
	}()
}

func TestAgent_ReconnectConfigSettings(t *testing.T) {
c := nextConfig()
func() {
Expand Down
18 changes: 17 additions & 1 deletion command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,14 @@ type DNSConfig struct {
DisableCompression bool `mapstructure:"disable_compression"`
}

// Performance is used to tune the performance of Consul's subsystems. It is
// decoded from the nested "performance" object of the agent configuration.
type Performance struct {
// RaftMultiplier is an integer multiplier used to scale Raft timing
// parameters: HeartbeatTimeout, ElectionTimeout, CommitTimeout, and
// LeaderLeaseTimeout. A value of 0 is treated as "not set": the agent only
// applies or merges the multiplier when it is greater than zero.
RaftMultiplier uint `mapstructure:"raft_multiplier"`
}

// Telemetry is the telemetry configuration for the server
type Telemetry struct {
// StatsiteAddr is the address of a statsite instance. If provided,
Expand Down Expand Up @@ -205,10 +213,13 @@ type Telemetry struct {
// Some of this is configurable as CLI flags, but most must
// be set using a configuration file.
type Config struct {
// DevMode enables a fast-path mode of opertaion to bring up an in-memory
// DevMode enables a fast-path mode of operation to bring up an in-memory
// server with minimal configuration. Useful for developing Consul.
DevMode bool `mapstructure:"-"`

// Performance is used to tune the performance of Consul's subsystems.
Performance Performance `mapstructure:"performance"`

// Bootstrap is used to bring up the first Consul server, and
// permits that node to elect itself leader
Bootstrap bool `mapstructure:"bootstrap"`
Expand Down Expand Up @@ -1085,6 +1096,11 @@ func DecodeCheckDefinition(raw interface{}) (*CheckDefinition, error) {
func MergeConfig(a, b *Config) *Config {
var result Config = *a

// Propagate non-default performance settings
if b.Performance.RaftMultiplier > 0 {
result.Performance.RaftMultiplier = b.Performance.RaftMultiplier
}

// Copy the strings if they're set
if b.Bootstrap {
result.Bootstrap = true
Expand Down
14 changes: 14 additions & 0 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -957,6 +957,17 @@ func TestDecodeConfig_invalidKeys(t *testing.T) {
}
}

// TestDecodeConfig_Performance checks that a nested "performance" object in
// the JSON configuration is decoded into Config.Performance.
func TestDecodeConfig_Performance(t *testing.T) {
	const raw = `{"performance": { "raft_multiplier": 3 }}`

	cfg, err := DecodeConfig(bytes.NewReader([]byte(raw)))
	if err != nil {
		t.Fatalf("err: %s", err)
	}

	// The multiplier from the JSON document must survive decoding unchanged.
	if got := cfg.Performance.RaftMultiplier; got != 3 {
		t.Fatalf("bad: multiplier isn't set: %#v", cfg)
	}
}

func TestDecodeConfig_Services(t *testing.T) {
input := `{
"services": [
Expand Down Expand Up @@ -1382,6 +1393,9 @@ func TestMergeConfig(t *testing.T) {
}

b := &Config{
Performance: Performance{
RaftMultiplier: 99,
},
Bootstrap: true,
BootstrapExpect: 3,
Datacenter: "dc2",
Expand Down
26 changes: 24 additions & 2 deletions consul/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ const (
DefaultDC = "dc1"
DefaultLANSerfPort = 8301
DefaultWANSerfPort = 8302

// See docs/guides/performance.html for information on how this value
// was obtained.
DefaultRaftMultiplier uint = 5
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add a sanity check for a MaxRaftMultiplier as well

)

var (
Expand Down Expand Up @@ -314,8 +318,11 @@ func DefaultConfig() *Config {
CoordinateUpdateBatchSize: 128,
CoordinateUpdateMaxBatches: 5,

// Hold an RPC for up to 5 seconds by default
RPCHoldTimeout: 5 * time.Second,
// This holds RPCs during leader elections. For the default Raft
// config the election timeout is 5 seconds, so we set this a
// bit longer to try to cover that period. This should be more
// than enough when running in the high performance mode.
RPCHoldTimeout: 7 * time.Second,
}

// Increase our reap interval to 3 days instead of 24h.
Expand All @@ -333,13 +340,28 @@ func DefaultConfig() *Config {
// Enable interoperability with unversioned Raft library, and don't
// start using new ID-based features yet.
conf.RaftConfig.ProtocolVersion = 1
conf.ScaleRaft(DefaultRaftMultiplier)

// Disable shutdown on removal
conf.RaftConfig.ShutdownOnRemove = false

return conf
}

// ScaleRaft sets the config to have Raft timing parameters scaled by the given
// performance multiplier. This is done in an idempotent way so it's not tricky
// to call this when composing configurations and potentially calling this
// multiple times on the same structure.
func (c *Config) ScaleRaft(raftMultRaw uint) {
raftMult := time.Duration(raftMultRaw)

def := raft.DefaultConfig()
c.RaftConfig.HeartbeatTimeout = raftMult * def.HeartbeatTimeout
c.RaftConfig.ElectionTimeout = raftMult * def.ElectionTimeout
c.RaftConfig.CommitTimeout = raftMult * def.CommitTimeout
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to scale this? This won't affect stability but affects the commit tail latency on followers

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh good catch - I'll remove this.

c.RaftConfig.LeaderLeaseTimeout = raftMult * def.LeaderLeaseTimeout
}

func (c *Config) tlsConfig() *tlsutil.Config {
tlsConf := &tlsutil.Config{
VerifyIncoming: c.VerifyIncoming,
Expand Down
11 changes: 6 additions & 5 deletions consul/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -820,11 +820,12 @@ func (s *Server) Stats() map[string]map[string]string {
s.remoteLock.RUnlock()
stats := map[string]map[string]string{
"consul": map[string]string{
"server": "true",
"leader": fmt.Sprintf("%v", s.IsLeader()),
"leader_addr": string(s.raft.Leader()),
"bootstrap": fmt.Sprintf("%v", s.config.Bootstrap),
"known_datacenters": toString(uint64(numKnownDCs)),
"server": "true",
"leader": fmt.Sprintf("%v", s.IsLeader()),
"leader_addr": string(s.raft.Leader()),
"bootstrap": fmt.Sprintf("%v", s.config.Bootstrap),
"known_datacenters": toString(uint64(numKnownDCs)),
"leader_lease_timeout": fmt.Sprintf("%v", s.config.RaftConfig.LeaderLeaseTimeout),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems odd to expose this since it is static?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yeah I was testing and using this as a sanity check but I'll remove it.

},
"raft": s.raft.Stats(),
"serf_lan": s.serfLAN.Stats(),
Expand Down
47 changes: 28 additions & 19 deletions testutil/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ import (
// offset is used to atomically increment the port numbers.
var offset uint64

// TestPerformanceConfig configures the performance parameters. It is
// serialized as the nested "performance" object of TestServerConfig.
type TestPerformanceConfig struct {
// RaftMultiplier is written to JSON as "raft_multiplier"; because of
// omitempty, a zero value is left out of the encoded config entirely.
RaftMultiplier uint `json:"raft_multiplier,omitempty"`
}

// TestPortConfig configures the various ports used for services
// provided by the Consul server.
type TestPortConfig struct {
Expand All @@ -51,20 +56,21 @@ type TestAddressConfig struct {

// TestServerConfig is the main server configuration struct.
type TestServerConfig struct {
NodeName string `json:"node_name"`
Bootstrap bool `json:"bootstrap,omitempty"`
Server bool `json:"server,omitempty"`
DataDir string `json:"data_dir,omitempty"`
Datacenter string `json:"datacenter,omitempty"`
DisableCheckpoint bool `json:"disable_update_check"`
LogLevel string `json:"log_level,omitempty"`
Bind string `json:"bind_addr,omitempty"`
Addresses *TestAddressConfig `json:"addresses,omitempty"`
Ports *TestPortConfig `json:"ports,omitempty"`
ACLMasterToken string `json:"acl_master_token,omitempty"`
ACLDatacenter string `json:"acl_datacenter,omitempty"`
ACLDefaultPolicy string `json:"acl_default_policy,omitempty"`
Stdout, Stderr io.Writer `json:"-"`
NodeName string `json:"node_name"`
Performance *TestPerformanceConfig `json:"performance,omitempty"`
Bootstrap bool `json:"bootstrap,omitempty"`
Server bool `json:"server,omitempty"`
DataDir string `json:"data_dir,omitempty"`
Datacenter string `json:"datacenter,omitempty"`
DisableCheckpoint bool `json:"disable_update_check"`
LogLevel string `json:"log_level,omitempty"`
Bind string `json:"bind_addr,omitempty"`
Addresses *TestAddressConfig `json:"addresses,omitempty"`
Ports *TestPortConfig `json:"ports,omitempty"`
ACLMasterToken string `json:"acl_master_token,omitempty"`
ACLDatacenter string `json:"acl_datacenter,omitempty"`
ACLDefaultPolicy string `json:"acl_default_policy,omitempty"`
Stdout, Stderr io.Writer `json:"-"`
}

// ServerConfigCallback is a function interface which can be
Expand All @@ -79,11 +85,14 @@ func defaultServerConfig() *TestServerConfig {
return &TestServerConfig{
NodeName: fmt.Sprintf("node%d", idx),
DisableCheckpoint: true,
Bootstrap: true,
Server: true,
LogLevel: "debug",
Bind: "127.0.0.1",
Addresses: &TestAddressConfig{},
Performance: &TestPerformanceConfig{
RaftMultiplier: 1,
},
Bootstrap: true,
Server: true,
LogLevel: "debug",
Bind: "127.0.0.1",
Addresses: &TestAddressConfig{},
Ports: &TestPortConfig{
DNS: 20000 + idx,
HTTP: 21000 + idx,
Expand Down
18 changes: 18 additions & 0 deletions website/source/docs/agent/options.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,24 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
* <a name="node_name"></a><a href="#node_name">`node_name`</a> Equivalent to the
[`-node` command-line flag](#_node).

* <a name="performance"></a><a href="#performance">`performance`</a> Available in Consul 0.7 and
later, this is a nested object that allows tuning the performance of different subsystems in
Consul. See the [Server Performance](/docs/guides/performance.html) guide for more details. The
following parameters are available:
* <a name="raft_multiplier"></a><a href="#raft_multiplier">`raft_multiplier`</a> - An integer
multiplier used by Consul servers to scale key Raft timing parameters. Tuning this affects
the time it takes Consul to detect leader failures and to perform leader elections, at the
expense of requiring more network and CPU resources for better performance.<br><br>A value
of 0, the default, means that Consul will use a lower-performance timing that's suitable for
[minimal Consul servers](/docs/guides/performance.html#minimum), currently equivalent to
setting this to a value of 5 (this default may be changed in future versions of Consul,
depending on whether the target minimum server profile changes). Above 0, higher values imply lower
levels of performance. Setting this to a value of 1 will configure Raft to its
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Above 0, higher values imply lower levels of performance." is confusing. Consider rephrasing to "The zero value uses the default, lower values are used to tighten timing and increase sensitivity while higher values relax timings and reduce sensitivity."

highest-performance mode, equivalent to the default timing of Consul prior to 0.7, and is
recommended for [production Consul servers](/docs/guides/performance.html#production). See
the note on [last contact](/docs/guides/performance.html#last-contact) timing for more
details on tuning this parameter.

* <a name="ports"></a><a href="#ports">`ports`</a> This is a nested object that allows setting
the bind ports for the following keys:
* <a name="dns_port"></a><a href="#dns_port">`dns`</a> - The DNS server, -1 to disable. Default 8600.
Expand Down
4 changes: 2 additions & 2 deletions website/source/docs/agent/telemetry.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@ These metrics are used to monitor the health of the Consul servers.
<td>timer</td>
</tr>
<tr>
<td>`consul.raft.leader.lastContact`</td>
<td>This measures the time that a Consul server was last contacted by the leader (will be zero on the leader itself). This is a general indicator of latency in the Raft subsystem, and gives a general indicator of how far behind [stale](/docs/agent/http.html#consistency) queries will be.</td>
<td><a name="last-contact"></a>`consul.raft.leader.lastContact`</td>
<td>This is only emitted by the Raft leader and measures the time since the leader was last able to contact the follower nodes when checking its leader lease. It can be used as a measure of how stable the Raft timing is and how close the leader is to timing out its lease.<br><br>The lease timeout is 500 ms times the [`raft_multiplier` configuration](/docs/agent/options.html#raft_multiplier), so this telemetry value should not get close to that timeout. If it does, the Raft timing is marginal and might need to be tuned, or more powerful servers might be needed. See the [Server Performance](/docs/guides/performance.html) guide for more details.</td>
<td>ms</td>
<td>timer</td>
</tr>
Expand Down
2 changes: 2 additions & 0 deletions website/source/docs/guides/dns-cache.html.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ for each lookup and can potentially exhaust the query throughput of a cluster.
For this reason, Consul provides a number of tuning parameters that can
customize how DNS queries are handled.

<a name="stale"></a>
## Stale Reads

Stale reads can be used to reduce latency and increase the throughput
Expand Down Expand Up @@ -60,6 +61,7 @@ client and Consul and set the cache values appropriately. In many cases
"appropriately" simply is turning negative response caching off to get the best
recovery time when a service becomes available again.

<a name="ttl"></a>
## TTL Values

TTL values can be set to allow DNS results to be cached downstream of Consul. Higher
Expand Down
Loading