diff --git a/cluster/config.go b/cluster/config.go index cdacd080b7..b864580f1f 100644 --- a/cluster/config.go +++ b/cluster/config.go @@ -23,6 +23,23 @@ var ( httpTimeout time.Duration minAvailableShards int + swimUseConfig string + swimTCPTimeout time.Duration + swimIndirectChecks int + swimRetransmitMult int + swimSuspicionMult int + swimSuspicionMaxTimeoutMult int + swimPushPullInterval time.Duration + swimProbeInterval time.Duration + swimProbeTimeout time.Duration + swimDisableTcpPings bool + swimAwarenessMaxMultiplier int + swimGossipInterval time.Duration + swimGossipNodes int + swimGossipToTheDeadTime time.Duration + swimEnableCompression bool + swimDNSConfigPath string + client http.Client ) @@ -37,6 +54,25 @@ func ConfigSetup() { clusterCfg.IntVar(&maxPrio, "max-priority", 10, "maximum priority before a node should be considered not-ready.") clusterCfg.IntVar(&minAvailableShards, "min-available-shards", 0, "minimum number of shards that must be available for a query to be handled.") globalconf.Register("cluster", clusterCfg) + + swimCfg := flag.NewFlagSet("swim", flag.ExitOnError) + swimCfg.StringVar(&swimUseConfig, "use-config", "default-lan", "config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. see https://godoc.org/github.com/hashicorp/memberlist#Config . Note all our swim settings correspond to default-lan") + swimCfg.DurationVar(&swimTCPTimeout, "tcp-timeout", 10*time.Second, "timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes") + swimCfg.IntVar(&swimIndirectChecks, "indirect-checks", 3, "number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails") + swimCfg.IntVar(&swimRetransmitMult, "retransmit-mult", 4, "multiplier for number of retransmissions for gossip messages. 
Retransmits = RetransmitMult * log(N+1)") + swimCfg.IntVar(&swimSuspicionMult, "suspicion-multi", 4, "multiplier for determining when inaccessible/suspect node is delared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval") + swimCfg.IntVar(&swimSuspicionMaxTimeoutMult, "suspicion-max-timeout-mult", 6, "multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout") + swimCfg.DurationVar(&swimPushPullInterval, "push-pull-interval", 30*time.Second, "interval between complete state syncs. 0 will disable state push/pull syncs") + swimCfg.DurationVar(&swimProbeInterval, "probe-interval", 1*time.Second, "interval between random node probes") + swimCfg.DurationVar(&swimProbeTimeout, "probe-timeout", 500*time.Millisecond, "timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT") + swimCfg.BoolVar(&swimDisableTcpPings, "disable-tcp-pings", false, "turn off the fallback TCP pings that are attempted if the direct UDP ping fails") + swimCfg.IntVar(&swimAwarenessMaxMultiplier, "awareness-max-multiplier", 8, "will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes.") + swimCfg.IntVar(&swimGossipNodes, "gossip-nodes", 3, "number of random nodes to send gossip messages to per GossipInterval") + swimCfg.DurationVar(&swimGossipInterval, "gossip-interval", 200*time.Millisecond, "interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip") + swimCfg.DurationVar(&swimGossipToTheDeadTime, "gossip-to-the-dead-time", 30*time.Second, "interval after which a node has died that we will still try to gossip to it. 
This gives it a chance to refute") + swimCfg.BoolVar(&swimEnableCompression, "enable-compression", true, "message compression") + swimCfg.StringVar(&swimDNSConfigPath, "dns-config-path", "/etc/resolv.conf", "system's DNS config file. Override allows for easier testing") + globalconf.Register("swim", swimCfg) } func ConfigProcess() { @@ -70,4 +106,10 @@ func ConfigProcess() { }, Timeout: httpTimeout, } + + if mode == "multi" { + if swimUseConfig != "none" && swimUseConfig != "default-lan" && swimUseConfig != "default-local" && swimUseConfig != "default-wan" { + log.Fatal(4, "CLU Config: invalid swim-use-config setting") + } + } } diff --git a/cluster/manager.go b/cluster/manager.go index a897c0954d..4d1a9268cd 100644 --- a/cluster/manager.go +++ b/cluster/manager.go @@ -80,7 +80,33 @@ func NewMemberlistManager(thisNode Node) *MemberlistManager { }, nodeName: thisNode.Name, } - mgr.cfg = memberlist.DefaultLANConfig() + switch swimUseConfig { + case "none": + mgr.cfg = memberlist.DefaultLANConfig() // use this as base so that the other settings have proper defaults + mgr.cfg.TCPTimeout = swimTCPTimeout + mgr.cfg.IndirectChecks = swimIndirectChecks + mgr.cfg.RetransmitMult = swimRetransmitMult + mgr.cfg.SuspicionMult = swimSuspicionMult + mgr.cfg.SuspicionMaxTimeoutMult = swimSuspicionMaxTimeoutMult + mgr.cfg.PushPullInterval = swimPushPullInterval + mgr.cfg.ProbeInterval = swimProbeInterval + mgr.cfg.ProbeTimeout = swimProbeTimeout + mgr.cfg.DisableTcpPings = swimDisableTcpPings + mgr.cfg.AwarenessMaxMultiplier = swimAwarenessMaxMultiplier + mgr.cfg.GossipInterval = swimGossipInterval + mgr.cfg.GossipNodes = swimGossipNodes + mgr.cfg.GossipToTheDeadTime = swimGossipToTheDeadTime + mgr.cfg.EnableCompression = swimEnableCompression + mgr.cfg.DNSConfigPath = swimDNSConfigPath + case "default-lan": + mgr.cfg = memberlist.DefaultLANConfig() + case "default-local": + mgr.cfg = memberlist.DefaultLocalConfig() + case "default-wan": + mgr.cfg = memberlist.DefaultWANConfig() + 
default: + panic("invalid swimUseConfig. should already have been validated") + } mgr.cfg.BindPort = clusterPort mgr.cfg.BindAddr = clusterHost.String() mgr.cfg.AdvertisePort = clusterPort diff --git a/docker/docker-chaos/metrictank.ini b/docker/docker-chaos/metrictank.ini index 048972fdaa..20a8cd7f6f 100644 --- a/docker/docker-chaos/metrictank.ini +++ b/docker/docker-chaos/metrictank.ini @@ -207,6 +207,48 @@ min-available-shards = 0 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable http-timeout = 60s +## SWIM clustering settings ## +# only relevant when using cluster mode 'multi' +# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config +# all values correspond literally to the memberlist.Config options +[swim] +# config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note all our swim settings correspond to default-lan +# see: +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig +use-config = default-lan +# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes +tcp-timeout = 10s +# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails +indirect-checks = 3 +# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1) +retransmit-mult = 4 +# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval +suspicion-multi = 4 +# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout +suspicion-max-timeout-mult = 6 +# interval between complete state syncs. 
0 will disable state push/pull syncs +push-pull-interval = 30s +# interval between random node probes +probe-interval = 1s +# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT +probe-timeout = 500ms +# turn off the fallback TCP pings that are attempted if the direct UDP ping fails +disable-tcp-pings = false +# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes. +awareness-max-multiplier = 8 +# number of random nodes to send gossip messages to per GossipInterval +gossip-nodes = 3 +# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip +gossip-interval = 200ms +# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute +gossip-to-the-dead-time = 30s +# message compression +enable-compression = true +# system's DNS config file. Override allows for easier testing +dns-config-path = /etc/resolv.conf + ## clustering transports for tracking chunk saves between replicated instances ## ### kafka as transport for clustering messages (recommended) [kafka-cluster] diff --git a/docker/docker-cluster/metrictank.ini b/docker/docker-cluster/metrictank.ini index 048972fdaa..20a8cd7f6f 100644 --- a/docker/docker-cluster/metrictank.ini +++ b/docker/docker-cluster/metrictank.ini @@ -207,6 +207,48 @@ min-available-shards = 0 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable http-timeout = 60s +## SWIM clustering settings ## +# only relevant when using cluster mode 'multi' +# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config +# all values correspond literally to the memberlist.Config options +[swim] +# config setting to use. 
If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note all our swim settings correspond to default-lan +# see: +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig +use-config = default-lan +# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes +tcp-timeout = 10s +# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails +indirect-checks = 3 +# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1) +retransmit-mult = 4 +# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval +suspicion-multi = 4 +# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout +suspicion-max-timeout-mult = 6 +# interval between complete state syncs. 0 will disable state push/pull syncs +push-pull-interval = 30s +# interval between random node probes +probe-interval = 1s +# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT +probe-timeout = 500ms +# turn off the fallback TCP pings that are attempted if the direct UDP ping fails +disable-tcp-pings = false +# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes. +awareness-max-multiplier = 8 +# number of random nodes to send gossip messages to per GossipInterval +gossip-nodes = 3 +# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 
0 disables non-piggyback gossip +gossip-interval = 200ms +# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute +gossip-to-the-dead-time = 30s +# message compression +enable-compression = true +# system's DNS config file. Override allows for easier testing +dns-config-path = /etc/resolv.conf + ## clustering transports for tracking chunk saves between replicated instances ## ### kafka as transport for clustering messages (recommended) [kafka-cluster] diff --git a/docker/docker-dev-custom-cfg-kafka/metrictank.ini b/docker/docker-dev-custom-cfg-kafka/metrictank.ini index 7cb6f96242..3b888e761a 100644 --- a/docker/docker-dev-custom-cfg-kafka/metrictank.ini +++ b/docker/docker-dev-custom-cfg-kafka/metrictank.ini @@ -207,6 +207,48 @@ min-available-shards = 0 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable http-timeout = 60s +## SWIM clustering settings ## +# only relevant when using cluster mode 'multi' +# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config +# all values correspond literally to the memberlist.Config options +[swim] +# config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note all our swim settings correspond to default-lan +# see: +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig +use-config = default-lan +# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes +tcp-timeout = 10s +# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails +indirect-checks = 3 +# multiplier for number of retransmissions for gossip messages. 
Retransmits = RetransmitMult * log(N+1) +retransmit-mult = 4 +# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval +suspicion-multi = 4 +# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout +suspicion-max-timeout-mult = 6 +# interval between complete state syncs. 0 will disable state push/pull syncs +push-pull-interval = 30s +# interval between random node probes +probe-interval = 1s +# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT +probe-timeout = 500ms +# turn off the fallback TCP pings that are attempted if the direct UDP ping fails +disable-tcp-pings = false +# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes. +awareness-max-multiplier = 8 +# number of random nodes to send gossip messages to per GossipInterval +gossip-nodes = 3 +# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip +gossip-interval = 200ms +# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute +gossip-to-the-dead-time = 30s +# message compression +enable-compression = true +# system's DNS config file. 
Override allows for easier testing +dns-config-path = /etc/resolv.conf + ## clustering transports for tracking chunk saves between replicated instances ## ### kafka as transport for clustering messages (recommended) [kafka-cluster] diff --git a/docs/config.md b/docs/config.md index 2200cfa6e2..1a466c4cf3 100644 --- a/docs/config.md +++ b/docs/config.md @@ -257,6 +257,51 @@ min-available-shards = 0 http-timeout = 60s ``` +## SWIM clustering settings ## + +``` +# only relevant when using cluster mode 'multi' +# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config +# all values correspond literally to the memberlist.Config options +[swim] +# config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note all our swim settings correspond to default-lan +# see: +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig +use-config = default-lan +# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes +tcp-timeout = 10s +# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails +indirect-checks = 3 +# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1) +retransmit-mult = 4 +# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval +suspicion-multi = 4 +# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout +suspicion-max-timeout-mult = 6 +# interval between complete state syncs. 
0 will disable state push/pull syncs +push-pull-interval = 30s +# interval between random node probes +probe-interval = 1s +# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT +probe-timeout = 500ms +# turn off the fallback TCP pings that are attempted if the direct UDP ping fails +disable-tcp-pings = false +# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes. +awareness-max-multiplier = 8 +# number of random nodes to send gossip messages to per GossipInterval +gossip-nodes = 3 +# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip +gossip-interval = 200ms +# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute +gossip-to-the-dead-time = 30s +# message compression +enable-compression = true +# system's DNS config file. Override allows for easier testing +dns-config-path = /etc/resolv.conf +``` + ## clustering transports for tracking chunk saves between replicated instances ## ### kafka as transport for clustering messages (recommended) diff --git a/metrictank-sample.ini b/metrictank-sample.ini index ce503eca8f..8be90deab0 100644 --- a/metrictank-sample.ini +++ b/metrictank-sample.ini @@ -210,6 +210,48 @@ min-available-shards = 0 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable http-timeout = 60s +## SWIM clustering settings ## +# only relevant when using cluster mode 'multi' +# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config +# all values correspond literally to the memberlist.Config options +[swim] +# config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. 
Note all our swim settings correspond to default-lan +# see: +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig +use-config = default-lan +# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes +tcp-timeout = 10s +# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails +indirect-checks = 3 +# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1) +retransmit-mult = 4 +# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval +suspicion-multi = 4 +# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout +suspicion-max-timeout-mult = 6 +# interval between complete state syncs. 0 will disable state push/pull syncs +push-pull-interval = 30s +# interval between random node probes +probe-interval = 1s +# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT +probe-timeout = 500ms +# turn off the fallback TCP pings that are attempted if the direct UDP ping fails +disable-tcp-pings = false +# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes. +awareness-max-multiplier = 8 +# number of random nodes to send gossip messages to per GossipInterval +gossip-nodes = 3 +# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip +gossip-interval = 200ms +# interval after which a node has died that we will still try to gossip to it. 
This gives it a chance to refute +gossip-to-the-dead-time = 30s +# message compression +enable-compression = true +# system's DNS config file. Override allows for easier testing +dns-config-path = /etc/resolv.conf + ## clustering transports for tracking chunk saves between replicated instances ## ### kafka as transport for clustering messages (recommended) [kafka-cluster] diff --git a/scripts/config/metrictank-docker.ini b/scripts/config/metrictank-docker.ini index 32dade5fe4..0fbc97eb81 100644 --- a/scripts/config/metrictank-docker.ini +++ b/scripts/config/metrictank-docker.ini @@ -207,6 +207,48 @@ min-available-shards = 0 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable http-timeout = 60s +## SWIM clustering settings ## +# only relevant when using cluster mode 'multi' +# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config +# all values correspond literally to the memberlist.Config options +[swim] +# config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note all our swim settings correspond to default-lan +# see: +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig +use-config = default-lan +# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes +tcp-timeout = 10s +# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails +indirect-checks = 3 +# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1) +retransmit-mult = 4 +# multiplier for determining when inaccessible/suspect node is declared dead. 
SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval +suspicion-multi = 4 +# multiplier for upper bound on detection time. SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout +suspicion-max-timeout-mult = 6 +# interval between complete state syncs. 0 will disable state push/pull syncs +push-pull-interval = 30s +# interval between random node probes +probe-interval = 1s +# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT +probe-timeout = 500ms +# turn off the fallback TCP pings that are attempted if the direct UDP ping fails +disable-tcp-pings = false +# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes. +awareness-max-multiplier = 8 +# number of random nodes to send gossip messages to per GossipInterval +gossip-nodes = 3 +# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip +gossip-interval = 200ms +# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute +gossip-to-the-dead-time = 30s +# message compression +enable-compression = true +# system's DNS config file. 
Override allows for easier testing +dns-config-path = /etc/resolv.conf + ## clustering transports for tracking chunk saves between replicated instances ## ### kafka as transport for clustering messages (recommended) [kafka-cluster] diff --git a/scripts/config/metrictank-package.ini b/scripts/config/metrictank-package.ini index 401708b22a..aeeab48870 100644 --- a/scripts/config/metrictank-package.ini +++ b/scripts/config/metrictank-package.ini @@ -207,6 +207,48 @@ min-available-shards = 0 # How long to wait before aborting http requests to cluster peers and returning a http 503 service unavailable http-timeout = 60s +## SWIM clustering settings ## +# only relevant when using cluster mode 'multi' +# for more details, see https://godoc.org/github.com/hashicorp/memberlist#Config +# all values correspond literally to the memberlist.Config options +[swim] +# config setting to use. If set, will override all other swim settings. Use none|default-lan|default-local|default-wan. Note all our swim settings correspond to default-lan +# see: +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLANConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultLocalConfig +# * https://godoc.org/github.com/hashicorp/memberlist#DefaultWANConfig +use-config = default-lan +# timeout for establishing a stream connection with peers for a full state sync, and for stream reads and writes +tcp-timeout = 10s +# number of nodes that will be asked to perform an indirect probe of a node in the case a direct probe fails +indirect-checks = 3 +# multiplier for number of retransmissions for gossip messages. Retransmits = RetransmitMult * log(N+1) +retransmit-mult = 4 +# multiplier for determining when inaccessible/suspect node is declared dead. SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval +suspicion-multi = 4 +# multiplier for upper bound on detection time. 
SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout +suspicion-max-timeout-mult = 6 +# interval between complete state syncs. 0 will disable state push/pull syncs +push-pull-interval = 30s +# interval between random node probes +probe-interval = 1s +# timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of network RTT +probe-timeout = 500ms +# turn off the fallback TCP pings that are attempted if the direct UDP ping fails +disable-tcp-pings = false +# will increase the probe interval if the node becomes aware that it might be degraded and not meeting the soft real time requirements to reliably probe other nodes. +awareness-max-multiplier = 8 +# number of random nodes to send gossip messages to per GossipInterval +gossip-nodes = 3 +# interval between sending messages that need to be gossiped that haven't been able to piggyback on probing messages. 0 disables non-piggyback gossip +gossip-interval = 200ms +# interval after which a node has died that we will still try to gossip to it. This gives it a chance to refute +gossip-to-the-dead-time = 30s +# message compression +enable-compression = true +# system's DNS config file. Override allows for easier testing +dns-config-path = /etc/resolv.conf + ## clustering transports for tracking chunk saves between replicated instances ## ### kafka as transport for clustering messages (recommended) [kafka-cluster]