Skip to content
This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

Commit

Permalink
support overriding SWIM (memberlist) performance tuneables
Browse files Browse the repository at this point in the history
  • Loading branch information
Dieterbe committed Dec 1, 2017
1 parent 7f40597 commit ae3c5da
Show file tree
Hide file tree
Showing 9 changed files with 366 additions and 1 deletion.
42 changes: 42 additions & 0 deletions cluster/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,23 @@ var (
httpTimeout time.Duration
minAvailableShards int

swimUseConfig string
swimTCPTimeout time.Duration
swimIndirectChecks int
swimRetransmitMult int
swimSuspicionMult int
swimSuspicionMaxTimeoutMult int
swimPushPullInterval time.Duration
swimProbeInterval time.Duration
swimProbeTimeout time.Duration
swimDisableTcpPings bool
swimAwarenessMaxMultiplier int
swimGossipInterval time.Duration
swimGossipNodes int
swimGossipToTheDeadTime time.Duration
swimEnableCompression bool
swimDNSConfigPath string

client http.Client
)

Expand All @@ -37,6 +54,25 @@ func ConfigSetup() {
clusterCfg.IntVar(&maxPrio, "max-priority", 10, "maximum priority before a node should be considered not-ready.")
clusterCfg.IntVar(&minAvailableShards, "min-available-shards", 0, "minimum number of shards that must be available for a query to be handled.")
globalconf.Register("cluster", clusterCfg)

swimCfg := flag.NewFlagSet("swim", flag.ExitOnError)
swimCfg.StringVar(&swimUseConfig, "use-config", "default-lan", "config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. see https://godoc.org/github.com/hashicorp/memberlist#Config . Note all our swim settings correspond to default-lan")
swimCfg.DurationVar(&swimTCPTimeout, "tcp-timeout", 10*time.Second, "timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes")
swimCfg.IntVar(&swimIndirectChecks, "indirect-checks", 3, "number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails")
swimCfg.IntVar(&swimRetransmitMult, "retransmit-mult", 4, "multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)")
swimCfg.IntVar(&swimSuspicionMult, "suspicion-multi", 4, "multiplier for determining when inaccessible/suspect node is delared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval")
swimCfg.IntVar(&swimSuspicionMaxTimeoutMult, "suspicion-max-timeout-mult", 6, "multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout")
swimCfg.DurationVar(&swimPushPullInterval, "push-pull-interval", 30*time.Second, "interval between complete state syncs. 0 will disable state push/pull syncs")
swimCfg.DurationVar(&swimProbeInterval, "probe-interval", 1*time.Second, "interval between random node probes")
swimCfg.DurationVar(&swimProbeTimeout, "probe-timeout", 500*time.Millisecond, "timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT")
swimCfg.BoolVar(&swimDisableTcpPings, "disable-tcp-pings", false, "turn off the fallback TCP pings that are attempted if the direct UDP ping fails")
swimCfg.IntVar(&swimAwarenessMaxMultiplier, "awareness-max-multiplier", 8, "will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.")
swimCfg.IntVar(&swimGossipNodes, "gossip-nodes", 3, "number of random nodes to send gossip messages to per GossipInterval")
swimCfg.DurationVar(&swimGossipInterval, "gossip-interval", 200*time.Millisecond, "interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip")
swimCfg.DurationVar(&swimGossipToTheDeadTime, "gossip-to-the-dead-time", 30*time.Second, "interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute")
swimCfg.BoolVar(&swimEnableCompression, "enable-compression", true, "message compression")
swimCfg.StringVar(&swimDNSConfigPath, "dns-config-path", "/etc/resolv.conf", "system's DNS config file. Override allows for easier testing")
globalconf.Register("swim", swimCfg)
}

func ConfigProcess() {
Expand Down Expand Up @@ -70,4 +106,10 @@ func ConfigProcess() {
},
Timeout: httpTimeout,
}

if mode == "multi" {
if swimUseConfig != "none" && swimUseConfig != "default-lan" && swimUseConfig != "default-local" && swimUseConfig != "default-wan" {
log.Fatal(4, "CLU Config: invalid swim-use-config setting")
}
}
}
28 changes: 27 additions & 1 deletion cluster/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,33 @@ func NewMemberlistManager(thisNode Node) *MemberlistManager {
},
nodeName: thisNode.Name,
}
mgr.cfg = memberlist.DefaultLANConfig()
switch swimUseConfig {
case "none":
mgr.cfg = memberlist.DefaultLANConfig() // use this as base so that the other settings have proper defaults
mgr.cfg.TCPTimeout = swimTCPTimeout
mgr.cfg.IndirectChecks = swimIndirectChecks
mgr.cfg.RetransmitMult = swimRetransmitMult
mgr.cfg.SuspicionMult = swimSuspicionMult
mgr.cfg.SuspicionMaxTimeoutMult = swimSuspicionMaxTimeoutMult
mgr.cfg.PushPullInterval = swimPushPullInterval
mgr.cfg.ProbeInterval = swimProbeInterval
mgr.cfg.ProbeTimeout = swimProbeTimeout
mgr.cfg.DisableTcpPings = swimDisableTcpPings
mgr.cfg.AwarenessMaxMultiplier = swimAwarenessMaxMultiplier
mgr.cfg.GossipInterval = swimGossipInterval
mgr.cfg.GossipNodes = swimGossipNodes
mgr.cfg.GossipToTheDeadTime = swimGossipToTheDeadTime
mgr.cfg.EnableCompression = swimEnableCompression
mgr.cfg.DNSConfigPath = swimDNSConfigPath
case "default-lan":
mgr.cfg = memberlist.DefaultLANConfig()
case "default-local":
mgr.cfg = memberlist.DefaultLocalConfig()
case "default-wan":
mgr.cfg = memberlist.DefaultWANConfig()
default:
panic("invalid swimUseConfig. should already have been validated")
}
mgr.cfg.BindPort = clusterPort
mgr.cfg.BindAddr = clusterHost.String()
mgr.cfg.AdvertisePort = clusterPort
Expand Down
42 changes: 42 additions & 0 deletions docker/docker-chaos/metrictank.ini
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,48 @@ min-available-shards = 0
# How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable
http-timeout = 60s

## SWIM clustering settings ##
# only relevant when using cluster mode 'multi'
# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
# all values correspond literally to the memberlist.Config options
[swim]
# config setting to use. Use none|default-lan|default-local|default-wan. When set to anything other than 'none', the chosen preset is used and all other swim settings are ignored. Note that the defaults of all our swim settings correspond to default-lan
# see:
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
use-config = default-lan
# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
tcp-timeout = 10s
# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
indirect-checks = 3
# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
retransmit-mult = 4
# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
suspicion-multi = 4
# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
suspicion-max-timeout-mult = 6
# interval between complete state syncs. 0 will disable state push/pull syncs
push-pull-interval = 30s
# interval between random node probes
probe-interval = 1s
# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
probe-timeout = 500ms
# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
disable-tcp-pings = false
# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
awareness-max-multiplier = 8
# number of random nodes to send gossip messages to per GossipInterval
gossip-nodes = 3
# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
gossip-interval = 200ms
# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
gossip-to-the-dead-time = 30s
# message compression
enable-compression = true
# system's DNS config file. Override allows for easier testing
dns-config-path = /etc/resolv.conf

## clustering transports for tracking chunk saves between replicated instances ##
### kafka as transport for clustering messages (recommended)
[kafka-cluster]
Expand Down
42 changes: 42 additions & 0 deletions docker/docker-cluster/metrictank.ini
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,48 @@ min-available-shards = 0
# How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable
http-timeout = 60s

## SWIM clustering settings ##
# only relevant when using cluster mode 'multi'
# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
# all values correspond literally to the memberlist.Config options
[swim]
# config setting to use. Use none|default-lan|default-local|default-wan. When set to anything other than 'none', the chosen preset is used and all other swim settings are ignored. Note that the defaults of all our swim settings correspond to default-lan
# see:
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
use-config = default-lan
# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
tcp-timeout = 10s
# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
indirect-checks = 3
# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
retransmit-mult = 4
# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
suspicion-multi = 4
# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
suspicion-max-timeout-mult = 6
# interval between complete state syncs. 0 will disable state push/pull syncs
push-pull-interval = 30s
# interval between random node probes
probe-interval = 1s
# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
probe-timeout = 500ms
# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
disable-tcp-pings = false
# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
awareness-max-multiplier = 8
# number of random nodes to send gossip messages to per GossipInterval
gossip-nodes = 3
# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
gossip-interval = 200ms
# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
gossip-to-the-dead-time = 30s
# message compression
enable-compression = true
# system's DNS config file. Override allows for easier testing
dns-config-path = /etc/resolv.conf

## clustering transports for tracking chunk saves between replicated instances ##
### kafka as transport for clustering messages (recommended)
[kafka-cluster]
Expand Down
42 changes: 42 additions & 0 deletions docker/docker-dev-custom-cfg-kafka/metrictank.ini
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,48 @@ min-available-shards = 0
# How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable
http-timeout = 60s

## SWIM clustering settings ##
# only relevant when using cluster mode 'multi'
# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
# all values correspond literally to the memberlist.Config options
[swim]
# config setting to use. Use none|default-lan|default-local|default-wan. When set to anything other than 'none', the chosen preset is used and all other swim settings are ignored. Note that the defaults of all our swim settings correspond to default-lan
# see:
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
use-config = default-lan
# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
tcp-timeout = 10s
# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
indirect-checks = 3
# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
retransmit-mult = 4
# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
suspicion-multi = 4
# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
suspicion-max-timeout-mult = 6
# interval between complete state syncs. 0 will disable state push/pull syncs
push-pull-interval = 30s
# interval between random node probes
probe-interval = 1s
# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
probe-timeout = 500ms
# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
disable-tcp-pings = false
# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
awareness-max-multiplier = 8
# number of random nodes to send gossip messages to per GossipInterval
gossip-nodes = 3
# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
gossip-interval = 200ms
# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
gossip-to-the-dead-time = 30s
# message compression
enable-compression = true
# system's DNS config file. Override allows for easier testing
dns-config-path = /etc/resolv.conf

## clustering transports for tracking chunk saves between replicated instances ##
### kafka as transport for clustering messages (recommended)
[kafka-cluster]
Expand Down
45 changes: 45 additions & 0 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,51 @@ min-available-shards = 0
http-timeout = 60s
```

## SWIM clustering settings ##

```
# only relevant when using cluster mode 'multi'
# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
# all values correspond literally to the memberlist.Config options
[swim]
# config setting to use. Use none|default-lan|default-local|default-wan. When set to anything other than 'none', the chosen preset is used and all other swim settings are ignored. Note that the defaults of all our swim settings correspond to default-lan
# see:
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
use-config = default-lan
# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
tcp-timeout = 10s
# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
indirect-checks = 3
# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
retransmit-mult = 4
# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
suspicion-multi = 4
# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
suspicion-max-timeout-mult = 6
# interval between complete state syncs. 0 will disable state push/pull syncs
push-pull-interval = 30s
# interval between random node probes
probe-interval = 1s
# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
probe-timeout = 500ms
# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
disable-tcp-pings = false
# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
awareness-max-multiplier = 8
# number of random nodes to send gossip messages to per GossipInterval
gossip-nodes = 3
# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
gossip-interval = 200ms
# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
gossip-to-the-dead-time = 30s
# message compression
enable-compression = true
# system's DNS config file. Override allows for easier testing
dns-config-path = /etc/resolv.conf
```

## clustering transports for tracking chunk saves between replicated instances ##
### kafka as transport for clustering messages (recommended)

Expand Down
42 changes: 42 additions & 0 deletions metrictank-sample.ini
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,48 @@ min-available-shards = 0
# How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable
http-timeout = 60s

## SWIM clustering settings ##
# only relevant when using cluster mode 'multi'
# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
# all values correspond literally to the memberlist.Config options
[swim]
# config setting to use. Use none|default-lan|default-local|default-wan. When set to anything other than 'none', the chosen preset is used and all other swim settings are ignored. Note that the defaults of all our swim settings correspond to default-lan
# see:
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
use-config = default-lan
# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
tcp-timeout = 10s
# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
indirect-checks = 3
# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
retransmit-mult = 4
# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
suspicion-multi = 4
# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
suspicion-max-timeout-mult = 6
# interval between complete state syncs. 0 will disable state push/pull syncs
push-pull-interval = 30s
# interval between random node probes
probe-interval = 1s
# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
probe-timeout = 500ms
# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
disable-tcp-pings = false
# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
awareness-max-multiplier = 8
# number of random nodes to send gossip messages to per GossipInterval
gossip-nodes = 3
# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
gossip-interval = 200ms
# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
gossip-to-the-dead-time = 30s
# message compression
enable-compression = true
# system's DNS config file. Override allows for easier testing
dns-config-path = /etc/resolv.conf

## clustering transports for tracking chunk saves between replicated instances ##
### kafka as transport for clustering messages (recommended)
[kafka-cluster]
Expand Down
Loading

0 comments on commit ae3c5da

Please sign in to comment.