support overriding SWIM (memberlist) performance tuneables

grafana · Dec 1, 2017 · ae3c5da · ae3c5da
1 parent 7f40597
commit ae3c5da
Show file tree

Hide file tree

Showing 9 changed files with 366 additions and 1 deletion.
diff --git a/cluster/config.go b/cluster/config.go
@@ -23,6 +23,23 @@ var (
 	httpTimeout        time.Duration
 	minAvailableShards int
 
+	swimUseConfig               string
+	swimTCPTimeout              time.Duration
+	swimIndirectChecks          int
+	swimRetransmitMult          int
+	swimSuspicionMult           int
+	swimSuspicionMaxTimeoutMult int
+	swimPushPullInterval        time.Duration
+	swimProbeInterval           time.Duration
+	swimProbeTimeout            time.Duration
+	swimDisableTcpPings         bool
+	swimAwarenessMaxMultiplier  int
+	swimGossipInterval          time.Duration
+	swimGossipNodes             int
+	swimGossipToTheDeadTime     time.Duration
+	swimEnableCompression       bool
+	swimDNSConfigPath           string
+
 	client http.Client
 )
 
@@ -37,6 +54,25 @@ func ConfigSetup() {
 	clusterCfg.IntVar(&maxPrio, "max-priority", 10, "maximum priority before a node should be considered not-ready.")
 	clusterCfg.IntVar(&minAvailableShards, "min-available-shards", 0, "minimum number of shards that must be available for a query to be handled.")
 	globalconf.Register("cluster", clusterCfg)
+
+	swimCfg := flag.NewFlagSet("swim", flag.ExitOnError)
+	swimCfg.StringVar(&swimUseConfig, "use-config", "default-lan", "config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. see https://godoc.org/github.com/hashicorp/memberlist#Config . Note all our swim settings correspond to default-lan")
+	swimCfg.DurationVar(&swimTCPTimeout, "tcp-timeout", 10*time.Second, "timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes")
+	swimCfg.IntVar(&swimIndirectChecks, "indirect-checks", 3, "number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails")
+	swimCfg.IntVar(&swimRetransmitMult, "retransmit-mult", 4, "multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)")
+	swimCfg.IntVar(&swimSuspicionMult, "suspicion-multi", 4, "multiplier for determining when inaccessible/suspect node is delared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval")
+	swimCfg.IntVar(&swimSuspicionMaxTimeoutMult, "suspicion-max-timeout-mult", 6, "multiplier for upper bound on detection time.  SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout")
+	swimCfg.DurationVar(&swimPushPullInterval, "push-pull-interval", 30*time.Second, "interval between complete state syncs. 0 will disable state push/pull syncs")
+	swimCfg.DurationVar(&swimProbeInterval, "probe-interval", 1*time.Second, "interval between random node probes")
+	swimCfg.DurationVar(&swimProbeTimeout, "probe-timeout", 500*time.Millisecond, "timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT")
+	swimCfg.BoolVar(&swimDisableTcpPings, "disable-tcp-pings", false, "turn off the fallback TCP pings that are attempted if the direct UDP ping fails")
+	swimCfg.IntVar(&swimAwarenessMaxMultiplier, "awareness-max-multiplier", 8, "will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.")
+	swimCfg.IntVar(&swimGossipNodes, "gossip-nodes", 3, "number of random nodes to send gossip messages to per GossipInterval")
+	swimCfg.DurationVar(&swimGossipInterval, "gossip-interval", 200*time.Millisecond, "interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip")
+	swimCfg.DurationVar(&swimGossipToTheDeadTime, "gossip-to-the-dead-time", 30*time.Second, "interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute")
+	swimCfg.BoolVar(&swimEnableCompression, "enable-compression", true, "message compression")
+	swimCfg.StringVar(&swimDNSConfigPath, "dns-config-path", "/etc/resolv.conf", "system's DNS config file. Override allows for easier testing")
+	globalconf.Register("swim", swimCfg)
 }
 
 func ConfigProcess() {
@@ -70,4 +106,10 @@ func ConfigProcess() {
 		},
 		Timeout: httpTimeout,
 	}
+
+	if mode == "multi" {
+		if swimUseConfig != "none" && swimUseConfig != "default-lan" && swimUseConfig != "default-local" && swimUseConfig != "default-wan" {
+			log.Fatal(4, "CLU Config: invalid swim-use-config setting")
+		}
+	}
 }
diff --git a/cluster/manager.go b/cluster/manager.go
@@ -80,7 +80,33 @@ func NewMemberlistManager(thisNode Node) *MemberlistManager {
 		},
 		nodeName: thisNode.Name,
 	}
-	mgr.cfg = memberlist.DefaultLANConfig()
+	switch swimUseConfig {
+	case "none":
+		mgr.cfg = memberlist.DefaultLANConfig() // use this as base so that the other settings have proper defaults
+		mgr.cfg.TCPTimeout = swimTCPTimeout
+		mgr.cfg.IndirectChecks = swimIndirectChecks
+		mgr.cfg.RetransmitMult = swimRetransmitMult
+		mgr.cfg.SuspicionMult = swimSuspicionMult
+		mgr.cfg.SuspicionMaxTimeoutMult = swimSuspicionMaxTimeoutMult
+		mgr.cfg.PushPullInterval = swimPushPullInterval
+		mgr.cfg.ProbeInterval = swimProbeInterval
+		mgr.cfg.ProbeTimeout = swimProbeTimeout
+		mgr.cfg.DisableTcpPings = swimDisableTcpPings
+		mgr.cfg.AwarenessMaxMultiplier = swimAwarenessMaxMultiplier
+		mgr.cfg.GossipInterval = swimGossipInterval
+		mgr.cfg.GossipNodes = swimGossipNodes
+		mgr.cfg.GossipToTheDeadTime = swimGossipToTheDeadTime
+		mgr.cfg.EnableCompression = swimEnableCompression
+		mgr.cfg.DNSConfigPath = swimDNSConfigPath
+	case "default-lan":
+		mgr.cfg = memberlist.DefaultLANConfig()
+	case "default-local":
+		mgr.cfg = memberlist.DefaultLocalConfig()
+	case "default-wan":
+		mgr.cfg = memberlist.DefaultWANConfig()
+	default:
+		panic("invalid swimUseConfig. should already have been validated")
+	}
 	mgr.cfg.BindPort = clusterPort
 	mgr.cfg.BindAddr = clusterHost.String()
 	mgr.cfg.AdvertisePort = clusterPort

diff --git a/docker/docker-chaos/metrictank.ini b/docker/docker-chaos/metrictank.ini
@@ -207,6 +207,48 @@ min-available-shards = 0
 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable
 http-timeout = 60s
 
+## SWIM clustering settings ##
+# only relevant when using cluster mode 'multi'
+# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
+# all values correspond literally to the memberlist.Config options
+[swim]
+# config setting to use. If set to anything other than none, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note the defaults of all our swim settings correspond to default-lan
+# see:
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
+use-config = default-lan
+# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
+tcp-timeout = 10s
+# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
+indirect-checks = 3
+# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
+retransmit-mult = 4
+# multiplier for determining when an inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
+suspicion-multi = 4
+# multiplier for upper bound on detection time.  SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
+suspicion-max-timeout-mult = 6
+# interval between complete state syncs. 0 will disable state push/pull syncs
+push-pull-interval = 30s
+# interval between random node probes
+probe-interval = 1s
+# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
+probe-timeout = 500ms
+# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
+disable-tcp-pings = false
+# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
+awareness-max-multiplier = 8
+# number of random nodes to send gossip messages to per GossipInterval
+gossip-nodes = 3
+# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
+gossip-interval = 200ms
+# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
+gossip-to-the-dead-time = 30s
+# message compression
+enable-compression = true
+# system's DNS config file. Override allows for easier testing
+dns-config-path = /etc/resolv.conf
+
 ## clustering transports for tracking chunk saves between replicated instances ##
 ### kafka as transport for clustering messages (recommended)
 [kafka-cluster]

diff --git a/docker/docker-cluster/metrictank.ini b/docker/docker-cluster/metrictank.ini
@@ -207,6 +207,48 @@ min-available-shards = 0
 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable
 http-timeout = 60s
 
+## SWIM clustering settings ##
+# only relevant when using cluster mode 'multi'
+# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
+# all values correspond literally to the memberlist.Config options
+[swim]
+# config setting to use. If set to anything other than none, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note the defaults of all our swim settings correspond to default-lan
+# see:
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
+use-config = default-lan
+# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
+tcp-timeout = 10s
+# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
+indirect-checks = 3
+# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
+retransmit-mult = 4
+# multiplier for determining when an inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
+suspicion-multi = 4
+# multiplier for upper bound on detection time.  SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
+suspicion-max-timeout-mult = 6
+# interval between complete state syncs. 0 will disable state push/pull syncs
+push-pull-interval = 30s
+# interval between random node probes
+probe-interval = 1s
+# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
+probe-timeout = 500ms
+# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
+disable-tcp-pings = false
+# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
+awareness-max-multiplier = 8
+# number of random nodes to send gossip messages to per GossipInterval
+gossip-nodes = 3
+# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
+gossip-interval = 200ms
+# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
+gossip-to-the-dead-time = 30s
+# message compression
+enable-compression = true
+# system's DNS config file. Override allows for easier testing
+dns-config-path = /etc/resolv.conf
+
 ## clustering transports for tracking chunk saves between replicated instances ##
 ### kafka as transport for clustering messages (recommended)
 [kafka-cluster]

diff --git a/docker/docker-dev-custom-cfg-kafka/metrictank.ini b/docker/docker-dev-custom-cfg-kafka/metrictank.ini
@@ -207,6 +207,48 @@ min-available-shards = 0
 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable
 http-timeout = 60s
 
+## SWIM clustering settings ##
+# only relevant when using cluster mode 'multi'
+# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
+# all values correspond literally to the memberlist.Config options
+[swim]
+# config setting to use. If set to anything other than none, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note the defaults of all our swim settings correspond to default-lan
+# see:
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
+use-config = default-lan
+# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
+tcp-timeout = 10s
+# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
+indirect-checks = 3
+# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
+retransmit-mult = 4
+# multiplier for determining when an inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
+suspicion-multi = 4
+# multiplier for upper bound on detection time.  SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
+suspicion-max-timeout-mult = 6
+# interval between complete state syncs. 0 will disable state push/pull syncs
+push-pull-interval = 30s
+# interval between random node probes
+probe-interval = 1s
+# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
+probe-timeout = 500ms
+# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
+disable-tcp-pings = false
+# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
+awareness-max-multiplier = 8
+# number of random nodes to send gossip messages to per GossipInterval
+gossip-nodes = 3
+# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
+gossip-interval = 200ms
+# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
+gossip-to-the-dead-time = 30s
+# message compression
+enable-compression = true
+# system's DNS config file. Override allows for easier testing
+dns-config-path = /etc/resolv.conf
+
 ## clustering transports for tracking chunk saves between replicated instances ##
 ### kafka as transport for clustering messages (recommended)
 [kafka-cluster]

diff --git a/docs/config.md b/docs/config.md
@@ -257,6 +257,51 @@ min-available-shards = 0
 http-timeout = 60s
 ```
 
+## SWIM clustering settings ##
+
+```
+# only relevant when using cluster mode 'multi'
+# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
+# all values correspond literally to the memberlist.Config options
+[swim]
+# config setting to use. If set to anything other than none, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note the defaults of all our swim settings correspond to default-lan
+# see:
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
+use-config = default-lan
+# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
+tcp-timeout = 10s
+# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
+indirect-checks = 3
+# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
+retransmit-mult = 4
+# multiplier for determining when an inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
+suspicion-multi = 4
+# multiplier for upper bound on detection time.  SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
+suspicion-max-timeout-mult = 6
+# interval between complete state syncs. 0 will disable state push/pull syncs
+push-pull-interval = 30s
+# interval between random node probes
+probe-interval = 1s
+# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
+probe-timeout = 500ms
+# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
+disable-tcp-pings = false
+# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
+awareness-max-multiplier = 8
+# number of random nodes to send gossip messages to per GossipInterval
+gossip-nodes = 3
+# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
+gossip-interval = 200ms
+# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
+gossip-to-the-dead-time = 30s
+# message compression
+enable-compression = true
+# system's DNS config file. Override allows for easier testing
+dns-config-path = /etc/resolv.conf
+```
+
 ## clustering transports for tracking chunk saves between replicated instances ##
 ### kafka as transport for clustering messages (recommended)
 

diff --git a/metrictank-sample.ini b/metrictank-sample.ini
@@ -210,6 +210,48 @@ min-available-shards = 0
 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable
 http-timeout = 60s
 
+## SWIM clustering settings ##
+# only relevant when using cluster mode 'multi'
+# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config
+# all values correspond literally to the memberlist.Config options
+[swim]
+# config setting to use. If set to anything other than none, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note the defaults of all our swim settings correspond to default-lan
+# see:
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig
+# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig
+use-config = default-lan
+# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes
+tcp-timeout = 10s
+# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails
+indirect-checks = 3
+# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1)
+retransmit-mult = 4
+# multiplier for determining when an inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval
+suspicion-multi = 4
+# multiplier for upper bound on detection time.  SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout
+suspicion-max-timeout-mult = 6
+# interval between complete state syncs. 0 will disable state push/pull syncs
+push-pull-interval = 30s
+# interval between random node probes
+probe-interval = 1s
+# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT
+probe-timeout = 500ms
+# turn off the fallback TCP pings that are attempted if the direct UDP ping fails
+disable-tcp-pings = false
+# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.
+awareness-max-multiplier = 8
+# number of random nodes to send gossip messages to per GossipInterval
+gossip-nodes = 3
+# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip
+gossip-interval = 200ms
+# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute
+gossip-to-the-dead-time = 30s
+# message compression
+enable-compression = true
+# system's DNS config file. Override allows for easier testing
+dns-config-path = /etc/resolv.conf
+
 ## clustering transports for tracking chunk saves between replicated instances ##
 ### kafka as transport for clustering messages (recommended)
 [kafka-cluster]