From ff202f69f624a1ef842d29c387d427687d4f7efc Mon Sep 17 00:00:00 2001 From: Aaron Stannard Date: Tue, 9 Mar 2021 16:49:05 -0600 Subject: [PATCH] Cleanup Akka.Cluster.SBR for release (#4755) * added DFN copyright headers to SBR files * formatted SBR code * completed documentation changes * fixed DocFx markdown * updated SBR docs --- .../clustering/split-brain-resolver.md | 136 +++- .../SBR/SplitBrainResolverSpec.cs | 9 +- .../Akka.Cluster/Configuration/Cluster.conf | 6 - src/core/Akka.Cluster/SBR/DowningStrategy.cs | 666 ++++++++---------- .../Akka.Cluster/SBR/SplitBrainResolver.cs | 477 ++++++------- .../SBR/SplitBrainResolverProvider.cs | 29 +- .../SBR/SplitBrainResolverSettings.cs | 80 ++- 7 files changed, 706 insertions(+), 697 deletions(-) diff --git a/docs/articles/clustering/split-brain-resolver.md b/docs/articles/clustering/split-brain-resolver.md index 17b75124b4a..bce3fcf8409 100644 --- a/docs/articles/clustering/split-brain-resolver.md +++ b/docs/articles/clustering/split-brain-resolver.md @@ -4,9 +4,6 @@ title: Split Brain Resolver --- # Split Brain Resolver -> [!NOTE] -> While this feature is based on [Lightbend Reactive Platform Split Brain Resolver](https://doc.akka.io/docs/akka/rp-16s01p02/scala/split-brain-resolver.html) feature description, however its implementation is a result of free contribution and interpretation of Akka.NET team. Lightbend doesn't take any responsibility for the state and correctness of it. - When working with an Akka.NET cluster, you must consider how to handle [network partitions](https://en.wikipedia.org/wiki/Network_partition) (a.k.a. split brain scenarios) and machine crashes (including .NET CLR/Core and hardware failures). This is crucial for correct behavior of your cluster, especially if you use Cluster Singleton or Cluster Sharding. ## The Problem @@ -28,7 +25,7 @@ Split brain resolver feature brings ability to apply different strategies for ma ```hocon akka.cluster { - downing-provider-class = "Akka.Cluster.SplitBrainResolver, Akka.Cluster" + downing-provider-class = "Akka.Cluster.SBR.SplitBrainResolverProvider, Akka.Cluster" split-brain-resolver { active-strategy = } @@ -37,21 +34,39 @@ akka.cluster { Keep in mind that split brain resolver will NOT work when `akka.cluster.auto-down-unreachable-after` is used. +## Split Brain Resolution Strategies +Beginning in Akka.NET v1.4.16, the Akka.NET project has ported the original split brain resolver implementations from Lightbend as they are now open source. The following section of documentation describes how Akka.NET's hand-rolled split brain resolvers are implemented. -## Strategies +### Picking a Strategy +In order to enable an Akka.NET split brain resolver in your cluster (they are not enabled by default), you will want to update your `akka.cluster` HOCON configuration to the following: -This section describes the different split brain resolver strategies. Please keep in mind, that there's no universal solution and each one of them may fail under specific circumstances. +```hocon +akka.cluster { + downing-provider-class = "Akka.Cluster.SBR.SplitBrainResolverProvider, Akka.Cluster" + split-brain-resolver { + active-strategy = + } +} +``` -To decide which strategy to use, you can set `akka.cluster.split-brain-resolver.active-strategy` to one of 4 different options: +This will cause the [`Akka.Cluster.SBR.SplitBrainResolverProvider`](xref:Akka.Cluster.SBR.SplitBrainResolverProvider) to be loaded by Akka.Cluster at startup and it will automatically begin executing your configured `active-strategy` on each member node that joins the cluster. -- `static-quorum` -- `keep-majority` -- `keep-oldest` -- `keep-referee` +> [!IMPORTANT] +> _The cluster leader_ on either side of the partition is the only one who actually downs unreachable nodes in the event of a split brain - so it is _essential_ that every node in the cluster be configured using the same split brain resolver settings. Otherwise there's no way to guarantee predictable behavior when network partitions occur. + +The following strategies are supported: + +* `static-quorum` +* `keep-majority` +* `keep-oldest` +* `down-all` +* `lease-majority` +* `keep-referee` - only available with the legacy split brain resolver. All strategies will be applied only after cluster state has reached stability for specified time threshold (no nodes transitioning between different states for some time), specified by `stable-after` setting. Nodes which are joining will not affect this treshold, as they won't be promoted to UP status in face unreachable nodes. For the same reason they won't be taken into account, when a strategy will be applied. ```hocon +akka.cluster.downing-provider-class = "Akka.Cluster.SBR.SplitBrainResolverProvider, Akka.Cluster" akka.cluster.split-brain-resolver { # Enable one of the available strategies (see descriptions below): # static-quorum, keep-majority, keep-oldest, keep-referee @@ -60,20 +75,41 @@ akka.cluster.split-brain-resolver { # Decision is taken by the strategy when there has been no membership or # reachability changes for this duration, i.e. the cluster state is stable. stable-after = 20s + + # When reachability observations by the failure detector are changed the SBR decisions + # are deferred until there are no changes within the 'stable-after' duration. + # If this continues for too long it might be an indication of an unstable system/network + # and it could result in delayed or conflicting decisions on separate sides of a network + # partition. + # + # As a precaution for that scenario all nodes are downed if no decision is made within + # `stable-after + down-all-when-unstable` from the first unreachability event. + # The measurement is reset if all unreachable have been healed, downed or removed, or + # if there are no changes within `stable-after * 2`. + # + # The value can be on, off, or a duration. + # + # By default it is 'on' and then it is derived to be 3/4 of stable-after, but not less than + # 4 seconds. + down-all-when-unstable = on } ``` -There is no simple way to decide the value of `stable-after`, as shorter value will give you the faster reaction time for unreachable nodes at cost of higher risk of false failures detected - due to temporary network issues. The rule of thumb for this setting is to set `stable-after` to `log10(maxExpectedNumberOfNodes) * 10`. +There is no simple way to decide the value of `stable-after`, as: -Remember that if a strategy will decide to down a particular node, it won't shutdown the related `ActorSystem`. In order to do so, use cluster removal hook like this: +* A shorter value will give you the faster reaction time for unreachable nodes at cost of higher risk of false positives, i.e. healthy nodes that are slow to be observed as reachable again prematurely being removed for the cluster due to temporary network issues. +* A higher value will increase the amount of time it takes to move resources on the truly unreachable side of the partition, i.e. sharded actors, cluster singletons, DData replicas, and so on longer to be re-homed onto reachable nodes in the healthy partition. -```csharp -Cluster.Get(system).RegisterOnMemberRemoved(() => { - system.Terminate().Wait(); -}); -``` +> [!NOTE] +> The rule of thumb for this setting is to set `stable-after` to `log10(maxExpectedNumberOfNodes) * 10`. + +The `down-all-when-unstable` option, which is _enabled by default_, will terminate the entire cluster in the event that cluster instability lasts for longer than the `stable-after` + 3/4 of the `stable-after` value in seconds - so by default, 35 seconds. -### Static Quorum +> [!IMPORTANT] +> If you are running in an environment where processes are not automatically restarted in the event of an unplanned termination (i.e. Kubernetes), we strongly recommend that you disable this setting by setting `akka.cluster.split-brain-resolver.down-all-when-unstable = off`. +> If you're running in a self-hosted environment or on infrastructure as a service, TURN THIS SETTING OFF unless you have automatic process supervision in-place (which you should always try to have.) + +#### Static Quorum The `static-quorum` strategy works well, when you are able to define minimum required cluster size. It will down unreachable nodes if the number of reachable ones is greater than or equal to a configured `quorum-size`. Otherwise reachable ones will be downed. @@ -104,7 +140,7 @@ akka.cluster.split-brain-resolver { } ``` -### Keep Majority +#### Keep Majority The `keep-majority` strategy will down this part of the cluster, which sees a lesser part of the whole cluster. This choice is made based on the latest known state of the cluster. When cluster will split into two equal parts, the one which contains the lowest address, will survive. @@ -131,7 +167,7 @@ akka.cluster.split-brain-resolver { } ``` -### Keep Oldest +#### Keep Oldest The `keep-oldest` strategy, when a network split has happened, will down a part of the cluster which doesn't contain the oldest node. @@ -163,7 +199,36 @@ akka.cluster.split-brain-resolver { } ``` -### Keep Referee +#### Down All +As the name implies, this strategy results in all members of the being downed unconditionally - forcing a full rebuild and recreation of the entire cluster if there are any unreachable nodes alive for longer than `akka.cluster.split-brain-resolver.stable-after` (20 seconds by default.) + +You can enable this strategy via the following: + +```hocon +akka.cluster { + downing-provider-class = "Akka.Cluster.SBR.SplitBrainResolverProvider, Akka.Cluster" + active-strategy = down-all +} +``` + +#### Keep Referee +This strategy is only available with the legacy Akka.Cluster split brain resolver, which you can enable via the following HOCON: + +```hocon +akka.cluster { + downing-provider-class = "Akka.Cluster.SplitBrainResolver, Akka.Cluster" + active-strategy = keep-referee + + keep-referee { + # referee address on the form of "akka.tcp://system@hostname:port" + address = "" + down-all-if-less-than-nodes = 1 + } +} +``` + +> [!WARNING] +> Akka.NET's hand-rolled split brain resolvers are deprecated and will be removed from Akka.NET as part of the Akka.NET v1.5 update. Please see "[Split Brain Resolution Strategies](#split-brain-resolution-strategies)" for the current guidance as of Akka.NET v1.4.17. The `keep-referee` strategy will simply down the part that does not contain the given referee node. @@ -176,27 +241,34 @@ Things to keep in mind: You can configure a minimum required amount of reachable nodes to maintain operability by using `down-all-if-less-than-nodes`. If a strategy will detect that the number of reachable nodes will go below that minimum it will down the entire partition even when referee node was reachable. -Configuration: +#### Lease Majority +The `lease-majority` downing provider strategy keeps all of the nodes in the cluster who are able to successfully acquire an [`Akka.Coordination.Lease`](xref:Akka.Coordination.Lease) - and the implementation of which must be specified by the user via configuration: ```hocon -akka.cluster.split-brain-resolver { - active-strategy = keep-referee +akka.cluster.split-brain-resolver.lease-majority { + lease-implementation = "" - keep-referee { - # referee address on the form of "akka.tcp://system@hostname:port" - address = "" - down-all-if-less-than-nodes = 1 - } + # This delay is used on the minority side before trying to acquire the lease, + # as an best effort to try to keep the majority side. + acquire-lease-delay-for-minority = 2s + + # If the 'role' is defined the majority/minority is based only on members with that 'role'. + role = "" } ``` -## Relation to Cluster Singleton and Cluster Sharding +A `Lease` is a type of distributed lock implementation. + +> [!NOTE] +> We are working on improving the documentation for Akka.Coordination, which will shed some more light on how this feature can be used in the future. + +### Relation to Cluster Singleton and Cluster Sharding Cluster singleton actors and sharded entities of cluster sharding have their lifecycle managed automatically. This means that there can be only one instance of a target actor at the same time in the cluster, and when detected dead, it will be resurrected on another node. However it's important the the old instance of the actor must be stopped before new one will be spawned, especially when used together will Akka.Persistence module. Otherwise this may result in corruption of actor's persistent state and violate actor state consistency. Since different nodes may apply their split brain decisions at different points in time, it may be good to configure a time margin necessary to make sure, that other nodes will get enough time to apply their strategies. This can be done using `akka.cluster.down-removal-margin` setting. The shorter it is, the faster reaction time of your cluster will be. However it will also increase the risk of having multiple singleton/sharded entity instances at the same time. It's recommended to set this value to be equal `stable-after` option described above. -### Expected Failover Time +#### Expected Failover Time for Shards and Singletons If you're going to use a split brain resolver, you can see that the total failover latency is determined by several values. Defaults are: diff --git a/src/core/Akka.Cluster.Tests/SBR/SplitBrainResolverSpec.cs b/src/core/Akka.Cluster.Tests/SBR/SplitBrainResolverSpec.cs index 61fa0ff6909..ca120bdcd37 100644 --- a/src/core/Akka.Cluster.Tests/SBR/SplitBrainResolverSpec.cs +++ b/src/core/Akka.Cluster.Tests/SBR/SplitBrainResolverSpec.cs @@ -1,4 +1,11 @@ -using System; +//----------------------------------------------------------------------- +// +// Copyright (C) 2009-2021 Lightbend Inc. +// Copyright (C) 2013-2021 .NET Foundation +// +//----------------------------------------------------------------------- + +using System; using System.Collections.Generic; using System.Linq; using System.Text; diff --git a/src/core/Akka.Cluster/Configuration/Cluster.conf b/src/core/Akka.Cluster/Configuration/Cluster.conf index 4a91613c058..9a083ea7036 100644 --- a/src/core/Akka.Cluster/Configuration/Cluster.conf +++ b/src/core/Akka.Cluster/Configuration/Cluster.conf @@ -309,12 +309,6 @@ akka.cluster.split-brain-resolver { # keep-referee - supported only with the old split brain resolver active-strategy = keep-majority - # Time margin after which shards or singletons that belonged to a downed/removed - # partition are created in surviving partition. The purpose of this margin is that - # in case of a network partition the persistent actors in the non-surviving partitions - # must be stopped before corresponding persistent actors are started somewhere else. - # This is useful if you implement downing strategies that handle network partitions, - # e.g. by keeping the larger side of the partition and shutting down the smaller side. # Decision is taken by the strategy when there has been no membership or # reachability changes for this duration, i.e. the cluster state is stable. stable-after = 20s diff --git a/src/core/Akka.Cluster/SBR/DowningStrategy.cs b/src/core/Akka.Cluster/SBR/DowningStrategy.cs index 8f9dcba81fd..5b0834e5af5 100644 --- a/src/core/Akka.Cluster/SBR/DowningStrategy.cs +++ b/src/core/Akka.Cluster/SBR/DowningStrategy.cs @@ -1,8 +1,14 @@ -using System; +//----------------------------------------------------------------------- +// +// Copyright (C) 2009-2021 Lightbend Inc. +// Copyright (C) 2013-2021 .NET Foundation +// +//----------------------------------------------------------------------- + +using System; using System.Collections.Generic; using System.Collections.Immutable; using System.Linq; -using System.Text; using Akka.Actor; using Akka.Coordination; @@ -16,44 +22,45 @@ internal interface IDecision internal class DownReachable : IDecision { public static readonly DownReachable Instance = new DownReachable(); - public bool IsIndirectlyConnected => false; private DownReachable() { } + + public bool IsIndirectlyConnected => false; } internal class DownUnreachable : IDecision { public static readonly DownUnreachable Instance = new DownUnreachable(); - public bool IsIndirectlyConnected => false; - private DownUnreachable() { } + + public bool IsIndirectlyConnected => false; } internal class DownAll : IDecision { public static readonly DownAll Instance = new DownAll(); - public bool IsIndirectlyConnected => false; - private DownAll() { } + + public bool IsIndirectlyConnected => false; } internal class DownIndirectlyConnected : IDecision { public static readonly DownIndirectlyConnected Instance = new DownIndirectlyConnected(); - public bool IsIndirectlyConnected => true; - private DownIndirectlyConnected() { } + + public bool IsIndirectlyConnected => true; } internal interface IAcquireLeaseDecision : IDecision @@ -90,7 +97,8 @@ public override int GetHashCode() } } - internal class AcquireLeaseAndDownIndirectlyConnected : IAcquireLeaseDecision, IEquatable + internal class AcquireLeaseAndDownIndirectlyConnected : IAcquireLeaseDecision, + IEquatable { public AcquireLeaseAndDownIndirectlyConnected(TimeSpan acquireDelay) { @@ -134,93 +142,186 @@ protected DowningStrategy() protected IComparer Ordering { get; set; } = Member.Ordering; // may contain Joining and WeaklyUp - public ImmutableHashSet Unreachable { get; private set; } = ImmutableHashSet.Empty; - - public bool IsUnreachable(Member m) => Unreachable.Contains(m.UniqueAddress); + public ImmutableHashSet Unreachable { get; private set; } = + ImmutableHashSet.Empty; public string Role { get; protected set; } // all Joining and WeaklyUp members public ImmutableSortedSet Joining => - AllMembers.Where(m => m.Status == MemberStatus.Joining || m.Status == MemberStatus.WeaklyUp).ToImmutableSortedSet(Ordering); + AllMembers.Where(m => m.Status == MemberStatus.Joining || m.Status == MemberStatus.WeaklyUp) + .ToImmutableSortedSet(Ordering); // all members, both joining and up. public ImmutableSortedSet AllMembers { get; private set; } //All members, but doesn't contain Joining, WeaklyUp, Down and Exiting. public ImmutableSortedSet Members => - GetMembers(includingPossiblyUp: false, excludingPossiblyExiting: false); + GetMembers(false, false); + + public ImmutableSortedSet MembersWithRole => + GetMembersWithRole(false, false); + + public ImmutableSortedSet ReachableMembers => + GetReachableMembers(false, false); + + public ImmutableSortedSet ReachableMembersWithRole => + GetReachableMembersWithRole(false, false); + + public ImmutableSortedSet UnreachableMembers => + GetUnreachableMembers(false, false); + + public ImmutableSortedSet UnreachableMembersWithRole => + GetUnreachableMembersWithRole(false, false); + + public Reachability Reachability { get; private set; } = Reachability.Empty; + + public ImmutableHashSet
SeenBy { get; private set; } = ImmutableHashSet
.Empty; /// - /// All members in self DC, but doesn't contain Joining, WeaklyUp, Down and Exiting. - /// - /// When `includingPossiblyUp=true` it also includes Joining and WeaklyUp members that could have been - /// changed to Up on the other side of a partition. - /// - /// When `excludingPossiblyExiting=true` it doesn't include Leaving members that could have been - /// changed to Exiting on the other side of the partition. + /// Nodes that are marked as unreachable but can communicate with gossip via a 3rd party. + /// Cycle in unreachability graph corresponds to that some node is both + /// observing another node as unreachable, and is also observed as unreachable by someone + /// else. + /// Another indication of indirectly connected nodes is if a node is marked as unreachable, + /// but it has still marked current gossip state as seen. + /// Those cases will not happen for clean splits and crashed nodes. + /// + public ImmutableHashSet IndirectlyConnected => + IndirectlyConnectedFromIntersectionOfObserversAndSubjects.Union(IndirectlyConnectedFromSeenCurrentGossip); + + private ImmutableHashSet IndirectlyConnectedFromIntersectionOfObserversAndSubjects + { + get + { + // cycle in unreachability graph + var observers = Reachability.AllObservers; + return observers.Intersect(Reachability.AllUnreachableOrTerminated); + } + } + + private ImmutableHashSet IndirectlyConnectedFromSeenCurrentGossip => + Reachability.Records.SelectMany(r => + { + if (SeenBy.Contains(r.Subject.Address)) + return new[] { r.Observer, r.Subject }; + return Array.Empty(); + }).ToImmutableHashSet(); + + public bool HasIndirectlyConnected => !IndirectlyConnected.IsEmpty; + + public ImmutableHashSet UnreachableButNotIndirectlyConnected => + Unreachable.Except(IndirectlyConnected); + + private ImmutableHashSet AdditionalNodesToDownWhenIndirectlyConnected + { + get + { + if (UnreachableButNotIndirectlyConnected.IsEmpty) return ImmutableHashSet.Empty; + + var originalUnreachable = Unreachable; + var originalReachability = Reachability; + try + { + var intersectionOfObserversAndSubjects = IndirectlyConnectedFromIntersectionOfObserversAndSubjects; + var haveSeenCurrentGossip = IndirectlyConnectedFromSeenCurrentGossip; + // remove records between the indirectly connected + Reachability = Reachability.FilterRecords( + r => + !(intersectionOfObserversAndSubjects.Contains(r.Observer) && + intersectionOfObserversAndSubjects.Contains(r.Subject) || + haveSeenCurrentGossip.Contains(r.Observer) && haveSeenCurrentGossip.Contains(r.Subject))); + Unreachable = Reachability.AllUnreachableOrTerminated; + var additionalDecision = Decide(); + + if (additionalDecision.IsIndirectlyConnected) + throw new InvalidOperationException( + $"SBR double {additionalDecision} decision, downing all instead. " + + $"originalReachability: [{originalReachability}], filtered reachability [{Reachability}], " + + $"still indirectlyConnected: [{string.Join(", ", IndirectlyConnected)}], seenBy: [{string.Join(", ", SeenBy)}]" + ); + + return NodesToDown(additionalDecision); + } + finally + { + Unreachable = originalUnreachable; + Reachability = originalReachability; + } + } + } + + public bool IsAllUnreachableDownOrExiting => + Unreachable.IsEmpty || + UnreachableMembers.All(m => m.Status == MemberStatus.Down || m.Status == MemberStatus.Exiting); + + public Lease Lease { get; protected set; } + + public bool IsUnreachable(Member m) + { + return Unreachable.Contains(m.UniqueAddress); + } + + /// + /// All members in self DC, but doesn't contain Joining, WeaklyUp, Down and Exiting. + /// When `includingPossiblyUp=true` it also includes Joining and WeaklyUp members that could have been + /// changed to Up on the other side of a partition. + /// When `excludingPossiblyExiting=true` it doesn't include Leaving members that could have been + /// changed to Exiting on the other side of the partition. /// /// /// /// - public ImmutableSortedSet GetMembers(bool includingPossiblyUp, bool excludingPossiblyExiting) => - AllMembers.Where(m => !((!includingPossiblyUp && m.Status == MemberStatus.Joining) || - (!includingPossiblyUp && m.Status == MemberStatus.WeaklyUp) || - (excludingPossiblyExiting && m.Status == MemberStatus.Leaving) || - m.Status == MemberStatus.Down || - m.Status == MemberStatus.Exiting) + public ImmutableSortedSet GetMembers(bool includingPossiblyUp, bool excludingPossiblyExiting) + { + return AllMembers.Where(m => !(!includingPossiblyUp && m.Status == MemberStatus.Joining || + !includingPossiblyUp && m.Status == MemberStatus.WeaklyUp || + excludingPossiblyExiting && m.Status == MemberStatus.Leaving || + m.Status == MemberStatus.Down || + m.Status == MemberStatus.Exiting) ).ToImmutableSortedSet(Ordering); - - public ImmutableSortedSet MembersWithRole => - GetMembersWithRole(includingPossiblyUp: false, excludingPossiblyExiting: false); + } public ImmutableSortedSet GetMembersWithRole(bool includingPossiblyUp, bool excludingPossiblyExiting) { if (string.IsNullOrEmpty(Role)) return GetMembers(includingPossiblyUp, excludingPossiblyExiting); - return GetMembers(includingPossiblyUp, excludingPossiblyExiting).Where(m => m.HasRole(Role)).ToImmutableSortedSet(Ordering); + return GetMembers(includingPossiblyUp, excludingPossiblyExiting).Where(m => m.HasRole(Role)) + .ToImmutableSortedSet(Ordering); } - public ImmutableSortedSet ReachableMembers => - GetReachableMembers(includingPossiblyUp: false, excludingPossiblyExiting: false); - public ImmutableSortedSet GetReachableMembers(bool includingPossiblyUp, bool excludingPossiblyExiting) { var mbrs = GetMembers(includingPossiblyUp, excludingPossiblyExiting); if (Unreachable.IsEmpty) return mbrs; - else - return mbrs.Where(m => !IsUnreachable(m)).ToImmutableSortedSet(Ordering); + return mbrs.Where(m => !IsUnreachable(m)).ToImmutableSortedSet(Ordering); } - public ImmutableSortedSet ReachableMembersWithRole => - GetReachableMembersWithRole(includingPossiblyUp: false, excludingPossiblyExiting: false); - - public ImmutableSortedSet GetReachableMembersWithRole(bool includingPossiblyUp, bool excludingPossiblyExiting) + public ImmutableSortedSet GetReachableMembersWithRole(bool includingPossiblyUp, + bool excludingPossiblyExiting) { if (string.IsNullOrEmpty(Role)) return GetReachableMembers(includingPossiblyUp, excludingPossiblyExiting); - return GetReachableMembers(includingPossiblyUp, excludingPossiblyExiting).Where(m => m.HasRole(Role)).ToImmutableSortedSet(Ordering); + return GetReachableMembers(includingPossiblyUp, excludingPossiblyExiting).Where(m => m.HasRole(Role)) + .ToImmutableSortedSet(Ordering); } - public ImmutableSortedSet UnreachableMembers => - GetUnreachableMembers(includingPossiblyUp: false, excludingPossiblyExiting: false); - public ImmutableSortedSet GetUnreachableMembers(bool includingPossiblyUp, bool excludingPossiblyExiting) { if (Unreachable.IsEmpty) return ImmutableSortedSet.Empty; - return GetMembers(includingPossiblyUp, excludingPossiblyExiting).Where(IsUnreachable).ToImmutableSortedSet(Ordering); + return GetMembers(includingPossiblyUp, excludingPossiblyExiting).Where(IsUnreachable) + .ToImmutableSortedSet(Ordering); } - public ImmutableSortedSet UnreachableMembersWithRole => - GetUnreachableMembersWithRole(includingPossiblyUp: false, excludingPossiblyExiting: false); - - public ImmutableSortedSet GetUnreachableMembersWithRole(bool includingPossiblyUp, bool excludingPossiblyExiting) + public ImmutableSortedSet GetUnreachableMembersWithRole(bool includingPossiblyUp, + bool excludingPossiblyExiting) { if (string.IsNullOrEmpty(Role)) return GetUnreachableMembers(includingPossiblyUp, excludingPossiblyExiting); - return GetUnreachableMembers(includingPossiblyUp, excludingPossiblyExiting).Where(m => m.HasRole(Role)).ToImmutableSortedSet(Ordering); + return GetUnreachableMembers(includingPossiblyUp, excludingPossiblyExiting).Where(m => m.HasRole(Role)) + .ToImmutableSortedSet(Ordering); } public void AddUnreachable(Member m) @@ -250,91 +351,37 @@ public void Remove(Member m) private void RemoveFromAllMembers(Member m) { if (ReferenceEquals(Ordering, Member.Ordering)) - { AllMembers = AllMembers.Remove(m); - } else - { // must use filterNot for removals/replace in the SortedSet when // ageOrdering is using upNumber and that will change when Joining -> Up - AllMembers = AllMembers.Where(i => !i.UniqueAddress.Equals(m.UniqueAddress)).ToImmutableSortedSet(Ordering); - } + AllMembers = AllMembers.Where(i => !i.UniqueAddress.Equals(m.UniqueAddress)) + .ToImmutableSortedSet(Ordering); } - public Reachability Reachability { get; private set; } = Reachability.Empty; - public void SetReachability(Reachability r) { // skip records with Reachability.Reachable, and skip records related to other DC Reachability = r.FilterRecords(record => - (record.Status == Reachability.ReachabilityStatus.Unreachable || record.Status == Reachability.ReachabilityStatus.Terminated) - ); + record.Status == Reachability.ReachabilityStatus.Unreachable || + record.Status == Reachability.ReachabilityStatus.Terminated + ); } - public ImmutableHashSet
SeenBy { get; private set; } = ImmutableHashSet
.Empty; - public void SetSeenBy(ImmutableHashSet
s) { SeenBy = s; } - /// - /// Nodes that are marked as unreachable but can communicate with gossip via a 3rd party. - /// - /// Cycle in unreachability graph corresponds to that some node is both - /// observing another node as unreachable, and is also observed as unreachable by someone - /// else. - /// - /// Another indication of indirectly connected nodes is if a node is marked as unreachable, - /// but it has still marked current gossip state as seen. - /// - /// Those cases will not happen for clean splits and crashed nodes. - /// - public ImmutableHashSet IndirectlyConnected - { - get - { - return IndirectlyConnectedFromIntersectionOfObserversAndSubjects.Union(IndirectlyConnectedFromSeenCurrentGossip); - } - } - - private ImmutableHashSet IndirectlyConnectedFromIntersectionOfObserversAndSubjects - { - get - { - // cycle in unreachability graph - var observers = Reachability.AllObservers; - return observers.Intersect(Reachability.AllUnreachableOrTerminated); - } - } - - private ImmutableHashSet IndirectlyConnectedFromSeenCurrentGossip - { - get - { - return Reachability.Records.SelectMany(r => - { - if (SeenBy.Contains(r.Subject.Address)) - return new[] { r.Observer, r.Subject }; - else - return Array.Empty(); - }).ToImmutableHashSet(); - } - } - - public bool HasIndirectlyConnected => !IndirectlyConnected.IsEmpty; - - public ImmutableHashSet UnreachableButNotIndirectlyConnected => Unreachable.Except(IndirectlyConnected); - public ImmutableHashSet NodesToDown(IDecision decision = null) { decision = decision ?? Decide(); var downable = Members - .Union(Joining) - .Where(m => m.Status != MemberStatus.Down && m.Status != MemberStatus.Exiting) - .Select(m => m.UniqueAddress) - .ToImmutableHashSet(); + .Union(Joining) + .Where(m => m.Status != MemberStatus.Down && m.Status != MemberStatus.Exiting) + .Select(m => m.UniqueAddress) + .ToImmutableHashSet(); switch (decision) { @@ -364,54 +411,6 @@ public ImmutableHashSet NodesToDown(IDecision decision = null) throw new InvalidOperationException(); } - private ImmutableHashSet AdditionalNodesToDownWhenIndirectlyConnected - { - get - { - if (UnreachableButNotIndirectlyConnected.IsEmpty) - return ImmutableHashSet.Empty; - else - { - var originalUnreachable = Unreachable; - var originalReachability = Reachability; - try - { - var intersectionOfObserversAndSubjects = IndirectlyConnectedFromIntersectionOfObserversAndSubjects; - var haveSeenCurrentGossip = IndirectlyConnectedFromSeenCurrentGossip; - // remove records between the indirectly connected - Reachability = Reachability.FilterRecords( - r => - !((intersectionOfObserversAndSubjects.Contains(r.Observer) && intersectionOfObserversAndSubjects.Contains(r.Subject)) || - (haveSeenCurrentGossip.Contains(r.Observer) && haveSeenCurrentGossip.Contains(r.Subject)))); - Unreachable = Reachability.AllUnreachableOrTerminated; - var additionalDecision = Decide(); - - if (additionalDecision.IsIndirectlyConnected) - throw new InvalidOperationException($"SBR double {additionalDecision} decision, downing all instead. " + - $"originalReachability: [{originalReachability}], filtered reachability [{Reachability}], " + - $"still indirectlyConnected: [{string.Join(", ", IndirectlyConnected)}], seenBy: [{string.Join(", ", SeenBy)}]" - ); - - return NodesToDown(additionalDecision); - } - finally - { - Unreachable = originalUnreachable; - Reachability = originalReachability; - } - } - } - } - - public bool IsAllUnreachableDownOrExiting - { - get - { - return Unreachable.IsEmpty || - UnreachableMembers.All(m => m.Status == MemberStatus.Down || m.Status == MemberStatus.Exiting); - } - } - public IDecision ReverseDecision(IDecision decision) { switch (decision) @@ -431,31 +430,27 @@ public IDecision ReverseDecision(IDecision decision) case ReverseDownIndirectlyConnected _: return DownIndirectlyConnected.Instance; } + throw new InvalidOperationException(); } public abstract IDecision Decide(); - - public Lease Lease { get; protected set; } } /// - /// Down the unreachable nodes if the number of remaining nodes are greater than or equal to the - /// given `quorumSize`. Otherwise down the reachable nodes, i.e. it will shut down that side of the partition. - /// In other words, the `quorumSize` defines the minimum number of nodes that the cluster must have to be operational. - /// If there are unreachable nodes when starting up the cluster, before reaching this limit, - /// the cluster may shutdown itself immediately. This is not an issue if you start all nodes at - /// approximately the same time. - /// - /// Note that you must not add more members to the cluster than `quorumSize * 2 - 1`, because then - /// both sides may down each other and thereby form two separate clusters. For example, - /// quorum quorumSize configured to 3 in a 6 node cluster may result in a split where each side - /// consists of 3 nodes each, i.e. each side thinks it has enough nodes to continue by - /// itself. A warning is logged if this recommendation is violated. - /// - /// If the `role` is defined the decision is based only on members with that `role`. - /// - /// It is only counting members within the own data center. + /// Down the unreachable nodes if the number of remaining nodes are greater than or equal to the + /// given `quorumSize`. Otherwise down the reachable nodes, i.e. it will shut down that side of the partition. + /// In other words, the `quorumSize` defines the minimum number of nodes that the cluster must have to be operational. + /// If there are unreachable nodes when starting up the cluster, before reaching this limit, + /// the cluster may shutdown itself immediately. This is not an issue if you start all nodes at + /// approximately the same time. + /// Note that you must not add more members to the cluster than `quorumSize * 2 - 1`, because then + /// both sides may down each other and thereby form two separate clusters. For example, + /// quorum quorumSize configured to 3 in a 6 node cluster may result in a split where each side + /// consists of 3 nodes each, i.e. each side thinks it has enough nodes to continue by + /// itself. A warning is logged if this recommendation is violated. + /// If the `role` is defined the decision is based only on members with that `role`. + /// It is only counting members within the own data center. /// internal class StaticQuorum : DowningStrategy { @@ -467,33 +462,29 @@ public StaticQuorum(int quorumSize, string role) public int QuorumSize { get; } + public bool IsTooManyMembers => + MembersWithRole.Count > QuorumSize * 2 - 1; + public override IDecision Decide() { if (IsTooManyMembers) return DownAll.Instance; - else if (HasIndirectlyConnected) + if (HasIndirectlyConnected) return DownIndirectlyConnected.Instance; - else if (MembersWithRole.Count - UnreachableMembersWithRole.Count >= QuorumSize) + if (MembersWithRole.Count - UnreachableMembersWithRole.Count >= QuorumSize) return DownUnreachable.Instance; - else - return DownReachable.Instance; + return DownReachable.Instance; } - - public bool IsTooManyMembers => - MembersWithRole.Count > (QuorumSize * 2 - 1); } /// - /// Down the unreachable nodes if the current node is in the majority part based the last known - /// membership information. Otherwise down the reachable nodes, i.e. the own part. If the the - /// parts are of equal size the part containing the node with the lowest address is kept. - /// - /// If the `role` is defined the decision is based only on members with that `role`. - /// - /// Note that if there are more than two partitions and none is in majority each part - /// will shutdown itself, terminating the whole cluster. - /// - /// It is only counting members within the own data center. + /// Down the unreachable nodes if the current node is in the majority part based the last known + /// membership information. Otherwise down the reachable nodes, i.e. the own part. If the the + /// parts are of equal size the part containing the node with the lowest address is kept. + /// If the `role` is defined the decision is based only on members with that `role`. + /// Note that if there are more than two partitions and none is in majority each part + /// will shutdown itself, terminating the whole cluster. + /// It is only counting members within the own data center. /// internal class KeepMajority : DowningStrategy { @@ -504,36 +495,29 @@ public KeepMajority(string role) public override IDecision Decide() { - if (HasIndirectlyConnected) - return DownIndirectlyConnected.Instance; - else - { - var ms = MembersWithRole; - if (ms.IsEmpty) - return DownAll.Instance; // no node with matching role - else - { - var reachableSize = ReachableMembersWithRole.Count; - var unreachableSize = UnreachableMembersWithRole.Count; + if (HasIndirectlyConnected) return DownIndirectlyConnected.Instance; + + var ms = MembersWithRole; + if (ms.IsEmpty) return DownAll.Instance; // no node with matching role - var decision = MajorityDecision(reachableSize, unreachableSize, ms.FirstOrDefault()); + var reachableSize = ReachableMembersWithRole.Count; + var unreachableSize = UnreachableMembersWithRole.Count; - switch (decision) + var decision = MajorityDecision(reachableSize, unreachableSize, ms.FirstOrDefault()); + + switch (decision) + { + case DownUnreachable _: + var decision2 = MajorityDecisionWhenIncludingMembershipChangesEdgeCase(); + switch (decision2) { case DownUnreachable _: - var decision2 = MajorityDecisionWhenIncludingMembershipChangesEdgeCase(); - switch (decision2) - { - case DownUnreachable _: - return DownUnreachable.Instance; // same conclusion - default: - return DownAll.Instance; // different conclusion, safest to DownAll - } + return DownUnreachable.Instance; // same conclusion default: - return decision; + return DownAll.Instance; // different conclusion, safest to DownAll } - - } + default: + return decision; } } @@ -544,65 +528,50 @@ private IDecision MajorityDecision(int thisSide, int otherSide, Member lowest) // equal size, keep the side with the lowest address (first in members) if (IsUnreachable(lowest)) return DownReachable.Instance; - else - return DownUnreachable.Instance; + return DownUnreachable.Instance; } - else if (thisSide > otherSide) - { + + if (thisSide > otherSide) // we are in majority return DownUnreachable.Instance; - } - else - { - // we are in minority - return DownReachable.Instance; - } + return DownReachable.Instance; } /// - /// Check for edge case when membership change happens at the same time as partition. - /// Count Joining and WeaklyUp on other side since those might be Up on other side. - /// Don't count Leaving on this side since those might be Exiting on other side. - /// Note that the membership changes we are looking for will only be done when all - /// members have seen previous state, i.e. when a member is moved to Up everybody - /// has seen it joining. + /// Check for edge case when membership change happens at the same time as partition. + /// Count Joining and WeaklyUp on other side since those might be Up on other side. + /// Don't count Leaving on this side since those might be Exiting on other side. + /// Note that the membership changes we are looking for will only be done when all + /// members have seen previous state, i.e. when a member is moved to Up everybody + /// has seen it joining. /// /// private IDecision MajorityDecisionWhenIncludingMembershipChangesEdgeCase() { // for this side we count as few as could be possible (excluding joining, excluding leaving) - var ms = GetMembersWithRole(includingPossiblyUp: false, excludingPossiblyExiting: true); - if (ms.IsEmpty) - { - return DownAll.Instance; - } - else - { - var thisSideReachableSize = GetReachableMembersWithRole(includingPossiblyUp: false, excludingPossiblyExiting: true).Count; - // for other side we count as many as could be possible (including joining, including leaving) - var otherSideUnreachableSize = GetUnreachableMembersWithRole(includingPossiblyUp: true, excludingPossiblyExiting: false).Count; - return MajorityDecision(thisSideReachableSize, otherSideUnreachableSize, ms.FirstOrDefault()); - } + var ms = GetMembersWithRole(false, true); + if (ms.IsEmpty) return DownAll.Instance; + + var thisSideReachableSize = GetReachableMembersWithRole(false, true).Count; + // for other side we count as many as could be possible (including joining, including leaving) + var otherSideUnreachableSize = GetUnreachableMembersWithRole(true, false).Count; + return MajorityDecision(thisSideReachableSize, otherSideUnreachableSize, ms.FirstOrDefault()); } } /// - /// Down the part that does not contain the oldest member (current singleton). - /// - /// There is one exception to this rule if `downIfAlone` is defined to `true`. - /// Then, if the oldest node has partitioned from all other nodes the oldest will - /// down itself and keep all other nodes running. The strategy will not down the - /// single oldest node when it is the only remaining node in the cluster. - /// - /// Note that if the oldest node crashes the others will remove it from the cluster - /// when `downIfAlone` is `true`, otherwise they will down themselves if the - /// oldest node crashes, i.e. shutdown the whole cluster together with the oldest node. - /// - /// If the `role` is defined the decision is based only on members with that `role`, - /// i.e. using the oldest member (singleton) within the nodes with that role. - /// - /// It is only using members within the own data center, i.e. oldest within the - /// data center. + /// Down the part that does not contain the oldest member (current singleton). + /// There is one exception to this rule if `downIfAlone` is defined to `true`. + /// Then, if the oldest node has partitioned from all other nodes the oldest will + /// down itself and keep all other nodes running. The strategy will not down the + /// single oldest node when it is the only remaining node in the cluster. + /// Note that if the oldest node crashes the others will remove it from the cluster + /// when `downIfAlone` is `true`, otherwise they will down themselves if the + /// oldest node crashes, i.e. shutdown the whole cluster together with the oldest node. + /// If the `role` is defined the decision is based only on members with that `role`, + /// i.e. using the oldest member (singleton) within the nodes with that role. + /// It is only using members within the own data center, i.e. oldest within the + /// data center. /// internal class KeepOldest : DowningStrategy { @@ -619,36 +588,30 @@ public KeepOldest(bool downIfAlone, string role) public override IDecision Decide() { - if (HasIndirectlyConnected) - return DownIndirectlyConnected.Instance; - else - { - var ms = MembersWithRole; - if (ms.IsEmpty) - return DownAll.Instance; // no node with matching role - else - { - var oldest = ms.FirstOrDefault(); - var oldestIsReachable = !IsUnreachable(oldest); - var reachableCount = ReachableMembersWithRole.Count; - var unreachableCount = UnreachableMembersWithRole.Count; + if (HasIndirectlyConnected) return DownIndirectlyConnected.Instance; + + var ms = MembersWithRole; + if (ms.IsEmpty) return DownAll.Instance; // no node with matching role - var decision = OldestDecision(oldestIsReachable, reachableCount, unreachableCount); - switch (decision) + var oldest = ms.FirstOrDefault(); + var oldestIsReachable = !IsUnreachable(oldest); + var reachableCount = ReachableMembersWithRole.Count; + var unreachableCount = UnreachableMembersWithRole.Count; + + var decision = OldestDecision(oldestIsReachable, reachableCount, unreachableCount); + switch (decision) + { + case DownUnreachable _: + var decision2 = OldestDecisionWhenIncludingMembershipChangesEdgeCase(); + switch (decision2) { case DownUnreachable _: - var decision2 = OldestDecisionWhenIncludingMembershipChangesEdgeCase(); - switch (decision2) - { - case DownUnreachable _: - return DownUnreachable.Instance; // same conclusion - default: - return DownAll.Instance; // different conclusion, safest to DownAll - } + return DownUnreachable.Instance; // same conclusion default: - return decision; + return DownAll.Instance; // different conclusion, safest to DownAll } - } + default: + return decision; } } @@ -663,56 +626,42 @@ private IDecision OldestDecision(bool oldestIsOnThisSide, int thisSide, int othe // oldest because it is alone if (DownIfAlone && thisSide == 1 && otherSide >= 2) return DownReachable.Instance; - else - return DownUnreachable.Instance; - } - else - { - if (DownIfAlone && otherSide == 1 && thisSide >= 2) - return DownUnreachable.Instance; - else - return DownReachable.Instance; + return DownUnreachable.Instance; } + + if (DownIfAlone && otherSide == 1 && thisSide >= 2) + return DownUnreachable.Instance; + return DownReachable.Instance; } /// - /// Check for edge case when membership change happens at the same time as partition. - /// Exclude Leaving on this side because those could be Exiting on other side. - /// - /// When `downIfAlone` also consider Joining and WeaklyUp since those might be Up on other side, - /// and thereby flip the alone test. + /// Check for edge case when membership change happens at the same time as partition. + /// Exclude Leaving on this side because those could be Exiting on other side. + /// When `downIfAlone` also consider Joining and WeaklyUp since those might be Up on other side, + /// and thereby flip the alone test. /// /// private IDecision OldestDecisionWhenIncludingMembershipChangesEdgeCase() { - var ms = GetMembersWithRole(includingPossiblyUp: false, excludingPossiblyExiting: true); - if (ms.IsEmpty) - { - return DownAll.Instance; - } - else - { - var oldest = ms.First(); - var oldestIsReachable = !IsUnreachable(oldest); - // Joining and WeaklyUp are only relevant when downIfAlone = true - var includingPossiblyUp = DownIfAlone; - var reachableCount = GetReachableMembersWithRole(includingPossiblyUp, excludingPossiblyExiting: true).Count; - var unreachableCount = GetUnreachableMembersWithRole(includingPossiblyUp, excludingPossiblyExiting: true).Count; - - return OldestDecision(oldestIsReachable, reachableCount, unreachableCount); - } + var ms = GetMembersWithRole(false, true); + if (ms.IsEmpty) return DownAll.Instance; + + var oldest = ms.First(); + var oldestIsReachable = !IsUnreachable(oldest); + // Joining and WeaklyUp are only relevant when downIfAlone = true + var includingPossiblyUp = DownIfAlone; + var reachableCount = GetReachableMembersWithRole(includingPossiblyUp, true).Count; + var unreachableCount = GetUnreachableMembersWithRole(includingPossiblyUp, true).Count; + + return OldestDecision(oldestIsReachable, reachableCount, unreachableCount); } } /// - /// Down all nodes unconditionally. + /// Down all nodes unconditionally. /// internal class DownAllNodes : DowningStrategy { - public DownAllNodes() - { - } - public override IDecision Decide() { return DownAll.Instance; @@ -720,14 +669,12 @@ public override IDecision Decide() } /// - /// Keep the part that can acquire the lease, and down the other part. - /// - /// Best effort is to keep the side that has most nodes, i.e. the majority side. - /// This is achieved by adding a delay before trying to acquire the lease on the - /// minority side. - /// - /// If the `role` is defined the majority/minority is based only on members with that `role`. - /// It is only counting members within the own data center. + /// Keep the part that can acquire the lease, and down the other part. + /// Best effort is to keep the side that has most nodes, i.e. the majority side. + /// This is achieved by adding a delay before trying to acquire the lease on the + /// minority side. + /// If the `role` is defined the majority/minority is based only on members with that `role`. + /// It is only counting members within the own data center. /// internal class LeaseMajority : DowningStrategy { @@ -740,22 +687,13 @@ public LeaseMajority(string role, Lease lease, TimeSpan acquireLeaseDelayForMino public TimeSpan AcquireLeaseDelayForMinority { get; } - public override IDecision Decide() - { - if (HasIndirectlyConnected) - return new AcquireLeaseAndDownIndirectlyConnected(TimeSpan.Zero); - else - return new AcquireLeaseAndDownUnreachable(AcquireLeaseDelay); - } - private TimeSpan AcquireLeaseDelay { get { if (IsInMinority) return AcquireLeaseDelayForMinority; - else - return TimeSpan.Zero; + return TimeSpan.Zero; } } @@ -764,30 +702,26 @@ private bool IsInMinority get { var ms = MembersWithRole; - if (ms.IsEmpty) - return false; // no node with matching role - else - { - var unreachableSize = UnreachableMembersWithRole.Count; - var membersSize = ms.Count; - - if (unreachableSize * 2 == membersSize) - { - // equal size, try to keep the side with the lowest address (first in members) - return IsUnreachable(ms.FirstOrDefault()); - } - else if (unreachableSize * 2 < membersSize) - { - // we are in majority - return false; - } - else - { - // we are in minority - return true; - } - } + if (ms.IsEmpty) return false; // no node with matching role + + var unreachableSize = UnreachableMembersWithRole.Count; + var membersSize = ms.Count; + + if (unreachableSize * 2 == membersSize) + // equal size, try to keep the side with the lowest address (first in members) + return IsUnreachable(ms.FirstOrDefault()); + if (unreachableSize * 2 < membersSize) + // we are in majority + return false; + return true; } } + + public override IDecision Decide() + { + if (HasIndirectlyConnected) + return new AcquireLeaseAndDownIndirectlyConnected(TimeSpan.Zero); + return new AcquireLeaseAndDownUnreachable(AcquireLeaseDelay); + } } } \ No newline at end of file diff --git a/src/core/Akka.Cluster/SBR/SplitBrainResolver.cs b/src/core/Akka.Cluster/SBR/SplitBrainResolver.cs index 4ebc379372e..22f8208824c 100644 --- a/src/core/Akka.Cluster/SBR/SplitBrainResolver.cs +++ b/src/core/Akka.Cluster/SBR/SplitBrainResolver.cs @@ -1,8 +1,14 @@ -using System; -using System.Collections.Generic; +//----------------------------------------------------------------------- +// +// Copyright (C) 2009-2021 Lightbend Inc. +// Copyright (C) 2013-2021 .NET Foundation +// +//----------------------------------------------------------------------- + +using System; using System.Collections.Immutable; +using System.Globalization; using System.Linq; -using System.Text; using Akka.Actor; using Akka.Coordination; using Akka.Event; @@ -12,19 +18,13 @@ namespace Akka.Cluster.SBR { /// - /// Unreachable members will be downed by this actor according to the given strategy. - /// It is active on the leader node in the cluster. - /// - /// The implementation is split into two classes SplitBrainResolver and SplitBrainResolverBase to be - /// able to unit test the logic without running cluster. + /// Unreachable members will be downed by this actor according to the given strategy. + /// It is active on the leader node in the cluster. + /// The implementation is split into two classes SplitBrainResolver and SplitBrainResolverBase to be + /// able to unit test the logic without running cluster. /// internal class SplitBrainResolver : SplitBrainResolverBase { - public static Props Props2(TimeSpan stableAfter, DowningStrategy strategy) - { - return Props.Create(() => new SplitBrainResolver(stableAfter, strategy)); - } - private readonly Cluster cluster; public SplitBrainResolver(TimeSpan stableAfter, DowningStrategy strategy) @@ -35,16 +35,23 @@ public SplitBrainResolver(TimeSpan stableAfter, DowningStrategy strategy) "SBR started. Config: strategy [{0}], stable-after [{1}], down-all-when-unstable [{2}], selfUniqueAddress [{3}].", Logging.SimpleName(strategy.GetType()), stableAfter, + // ReSharper disable VirtualMemberCallInConstructor DownAllWhenUnstable == TimeSpan.Zero ? "off" : DownAllWhenUnstable.ToString(), SelfUniqueAddress.Address); + // ReSharper restore VirtualMemberCallInConstructor } public override UniqueAddress SelfUniqueAddress => cluster.SelfUniqueAddress; + public static Props Props2(TimeSpan stableAfter, DowningStrategy strategy) + { + return Props.Create(() => new SplitBrainResolver(stableAfter, strategy)); + } + // re-subscribe when restart protected override void PreStart() { - cluster.Subscribe(Self, ClusterEvent.InitialStateAsEvents, typeof(IClusterDomainEvent)); + cluster.Subscribe(Self, InitialStateAsEvents, typeof(IClusterDomainEvent)); base.PreStart(); } @@ -62,123 +69,27 @@ public override void Down(UniqueAddress node, IDecision decision) } /// - /// The implementation is split into two classes SplitBrainResolver and SplitBrainResolverBase to be - /// able to unit test the logic without running cluster. + /// The implementation is split into two classes SplitBrainResolver and SplitBrainResolverBase to be + /// able to unit test the logic without running cluster. /// internal abstract class SplitBrainResolverBase : ActorBase, IWithUnboundedStash, IWithTimers { - internal class Tick - { - public static readonly Tick Instance = new Tick(); - - private Tick() - { - } - } - - /// - /// Response (result) of the acquire lease request. - /// - protected class AcquireLeaseResult - { - public AcquireLeaseResult(bool holdingLease) - { - HoldingLease = holdingLease; - } - - public bool HoldingLease { get; } - } - - /// - /// Response (result) of the release lease request. - /// - protected class ReleaseLeaseResult - { - public ReleaseLeaseResult(bool released) - { - Released = released; - } - - public bool Released { get; } - } - - /// - /// For delayed acquire of the lease. - /// - protected class AcquireLease - { - public static readonly AcquireLease Instance = new AcquireLease(); - - private AcquireLease() - { - } - } - - protected class ReachabilityChangedStats - { - public ReachabilityChangedStats(DateTime firstChangeTimestamp, DateTime latestChangeTimestamp, long changeCount) - { - FirstChangeTimestamp = firstChangeTimestamp; - LatestChangeTimestamp = latestChangeTimestamp; - ChangeCount = changeCount; - } - - public DateTime FirstChangeTimestamp { get; } - public DateTime LatestChangeTimestamp { get; } - public long ChangeCount { get; } - - public bool IsEmpty => ChangeCount == 0; - - public override string ToString() - { - if (IsEmpty) - return "reachability unchanged"; - else - { - var now = DateTime.UtcNow; - return $"reachability changed {ChangeCount} times since {(now - FirstChangeTimestamp).TotalMilliseconds} ms ago, " + - $"latest change was {(now - LatestChangeTimestamp).TotalMilliseconds} ms ago"; - } - } - } - - protected interface IReleaseLeaseCondition - { - } + private readonly TimeSpan releaseLeaseAfter; - protected static class ReleaseLeaseCondition - { - public class NoLease : IReleaseLeaseCondition - { - public static readonly NoLease Instance = new NoLease(); - - private NoLease() - { - } - } + // would be better as constructor parameter, but don't want to break Cinnamon instrumentation + private readonly SplitBrainResolverSettings settings; + private ILoggingAdapter _log; - public class WhenMembersRemoved : IReleaseLeaseCondition - { - public WhenMembersRemoved(ImmutableHashSet nodes) - { - Nodes = nodes; - } - public ImmutableHashSet Nodes { get; } - } + private ReachabilityChangedStats reachabilityChangedStats = + new ReachabilityChangedStats(DateTime.UtcNow, DateTime.UtcNow, 0); - public class WhenTimeElapsed : IReleaseLeaseCondition - { - public WhenTimeElapsed(Deadline deadline) - { - Deadline = deadline; - } + private IReleaseLeaseCondition releaseLeaseCondition = ReleaseLeaseCondition.NoLease.Instance; + private bool selfMemberAdded; - public Deadline Deadline { get; } - } - } + private Deadline stableDeadline; - public SplitBrainResolverBase(TimeSpan stableAfter, DowningStrategy strategy) + protected SplitBrainResolverBase(TimeSpan stableAfter, DowningStrategy strategy) { StableAfter = stableAfter; Strategy = strategy; @@ -186,6 +97,7 @@ public SplitBrainResolverBase(TimeSpan stableAfter, DowningStrategy strategy) settings = new SplitBrainResolverSettings(Context.System.Settings.Config); releaseLeaseAfter = stableAfter + stableAfter; + // ReSharper disable once VirtualMemberCallInConstructor Timers.StartPeriodicTimer(Tick.Instance, Tick.Instance, TickInterval); ResetStableDeadline(); @@ -195,48 +107,38 @@ public SplitBrainResolverBase(TimeSpan stableAfter, DowningStrategy strategy) public DowningStrategy Strategy { get; } - public IStash Stash { get; set; } - - public ITimerScheduler Timers { get; set; } - - - - private ILoggingAdapter _log; - public ILoggingAdapter Log => _log ?? (_log = Context.GetLogger()); public abstract UniqueAddress SelfUniqueAddress { get; } - public abstract void Down(UniqueAddress node, IDecision decision); + public virtual TimeSpan DownAllWhenUnstable => settings.DownAllWhenUnstable; - // would be better as constructor parameter, but don't want to break Cinnamon instrumentation - private SplitBrainResolverSettings settings; + public virtual TimeSpan TickInterval => TimeSpan.FromSeconds(1); - public virtual TimeSpan DownAllWhenUnstable => settings.DownAllWhenUnstable; + protected bool Leader { get; private set; } - private TimeSpan releaseLeaseAfter; + public bool IsResponsible => Leader && selfMemberAdded; - public virtual TimeSpan TickInterval => TimeSpan.FromSeconds(1); + public ITimerScheduler Timers { get; set; } - protected bool Leader { get; private set; } = false; - private bool selfMemberAdded = false; + public IStash Stash { get; set; } + + public abstract void Down(UniqueAddress node, IDecision decision); // private def internalDispatcher: ExecutionContext = // context.system.asInstanceOf[ExtendedActorSystem].dispatchers.internalDispatcher // overridden in tests - protected virtual Deadline NewStableDeadline() => Deadline.Now + StableAfter; - - private Deadline stableDeadline; + protected virtual Deadline NewStableDeadline() + { + return Deadline.Now + StableAfter; + } public void ResetStableDeadline() { stableDeadline = NewStableDeadline(); } - - private ReachabilityChangedStats reachabilityChangedStats = new ReachabilityChangedStats(DateTime.UtcNow, DateTime.UtcNow, 0); - private void ResetReachabilityChangedStats() { var now = DateTime.UtcNow; @@ -252,10 +154,9 @@ private void ResetReachabilityChangedStatsIfAllUnreachableDowned() } } - private IReleaseLeaseCondition releaseLeaseCondition = ReleaseLeaseCondition.NoLease.Instance; - /// - /// Helper to wrap updates to strategy info with, so that stable-after timer is reset and information is logged about state change */ + /// Helper to wrap updates to strategy info with, so that stable-after timer is reset and information is logged about + /// state change */ /// /// /// @@ -265,34 +166,31 @@ public void MutateMemberInfo(bool resetStable, Action f) f(); var unreachableAfter = Strategy.Unreachable.Count; - string EarliestTimeOfDecision() => (DateTime.UtcNow + StableAfter).ToString(); + string EarliestTimeOfDecision() + { + return (DateTime.UtcNow + StableAfter).ToString(CultureInfo.InvariantCulture); + } if (resetStable) { if (IsResponsible) { if (unreachableBefore == 0 && unreachableAfter > 0) - { Log.Info( - "SBR found unreachable members, waiting for stable-after = {0} ms before taking downing decision. " + - "Now {1} unreachable members found. Downing decision will not be made before {3}.", - StableAfter.TotalMilliseconds, - unreachableAfter, - EarliestTimeOfDecision()); - } + "SBR found unreachable members, waiting for stable-after = {0} ms before taking downing decision. " + + "Now {1} unreachable members found. Downing decision will not be made before {3}.", + StableAfter.TotalMilliseconds, + unreachableAfter, + EarliestTimeOfDecision()); else if (unreachableBefore > 0 && unreachableAfter == 0) - { Log.Info( - "SBR found all unreachable members healed during stable-after period, no downing decision necessary for now."); - } + "SBR found all unreachable members healed during stable-after period, no downing decision necessary for now."); else if (unreachableAfter > 0) - { Log.Info( - "SBR found unreachable members changed during stable-after period. Resetting timer. " + - "Now {0} unreachable members found. Downing decision will not be made before {1}.", - unreachableAfter, - EarliestTimeOfDecision()); - } + "SBR found unreachable members changed during stable-after period. Resetting timer. " + + "Now {0} unreachable members found. Downing decision will not be made before {1}.", + unreachableAfter, + EarliestTimeOfDecision()); // else no unreachable members found but set of members changed } @@ -302,7 +200,7 @@ public void MutateMemberInfo(bool resetStable, Action f) } /// - /// Helper to wrap updates to `leader` and `selfMemberAdded` to log changes in responsibility status */ + /// Helper to wrap updates to `leader` and `selfMemberAdded` to log changes in responsibility status */ /// /// public void MutateResponsibilityInfo(Action f) @@ -313,8 +211,8 @@ public void MutateResponsibilityInfo(Action f) if (!responsibleBefore && responsibleAfter) Log.Info( - "This node is now the leader responsible for taking SBR decisions among the reachable nodes " + - "(more leaders may exist)."); + "This node is now the leader responsible for taking SBR decisions among the reachable nodes " + + "(more leaders may exist)."); else if (responsibleBefore && !responsibleAfter) Log.Info("This node is not the leader any more and not responsible for taking SBR decisions."); @@ -325,11 +223,9 @@ public void MutateResponsibilityInfo(Action f) protected override void PostStop() { if (!(releaseLeaseCondition is ReleaseLeaseCondition.NoLease)) - { Log.Info( - "SBR is stopped and owns the lease. The lease will not be released until after the " + - "lease heartbeat-timeout."); - } + "SBR is stopped and owns the lease. The lease will not be released until after the " + + "lease heartbeat-timeout."); base.PostStop(); } @@ -384,15 +280,13 @@ protected override bool Receive(object message) // not interested in other events return true; } + return false; } private void LeaderChanged(Address leaderOption) { - MutateResponsibilityInfo(() => - { - Leader = leaderOption?.Equals(SelfUniqueAddress.Address) == true; - }); + MutateResponsibilityInfo(() => { Leader = leaderOption?.Equals(SelfUniqueAddress.Address) == true; }); } private void OnTick() @@ -402,28 +296,28 @@ private void OnTick() if (reachabilityChangedStats.ChangeCount > 0) { var now = DateTime.UtcNow; - var durationSinceLatestChange = (now - reachabilityChangedStats.LatestChangeTimestamp); - var durationSinceFirstChange = (now - reachabilityChangedStats.FirstChangeTimestamp); + var durationSinceLatestChange = now - reachabilityChangedStats.LatestChangeTimestamp; + var durationSinceFirstChange = now - reachabilityChangedStats.FirstChangeTimestamp; var downAllWhenUnstableEnabled = DownAllWhenUnstable > TimeSpan.Zero; - if (downAllWhenUnstableEnabled && durationSinceFirstChange > (StableAfter + DownAllWhenUnstable)) + if (downAllWhenUnstableEnabled && durationSinceFirstChange > StableAfter + DownAllWhenUnstable) { Log.Warning( - //ClusterLogMarker.sbrInstability, - "SBR detected instability and will down all nodes: {0}", - reachabilityChangedStats); + //ClusterLogMarker.sbrInstability, + "SBR detected instability and will down all nodes: {0}", + reachabilityChangedStats); ActOnDecision(DownAll.Instance); } - else if (!downAllWhenUnstableEnabled && durationSinceLatestChange > (StableAfter + StableAfter)) + else if (!downAllWhenUnstableEnabled && durationSinceLatestChange > StableAfter + StableAfter) { // downAllWhenUnstable is disabled but reset for meaningful logging - Log.Debug("SBR no reachability changes within {0} ms, resetting stats", (StableAfter + StableAfter).TotalMilliseconds); + Log.Debug("SBR no reachability changes within {0} ms, resetting stats", + (StableAfter + StableAfter).TotalMilliseconds); ResetReachabilityChangedStats(); } } if (IsResponsible && !Strategy.Unreachable.IsEmpty && stableDeadline.IsOverdue) - { switch (Strategy.Decide()) { case IAcquireLeaseDecision decision: @@ -434,31 +328,38 @@ private void OnTick() if (lease.CheckLease()) { Log.Info( - "SBR has acquired lease for decision [{0}]", - decision); + "SBR has acquired lease for decision [{0}]", + decision); ActOnDecision(decision); } else { if (decision.AcquireDelay == TimeSpan.Zero) + { OnAcquireLease(); // reply message is AcquireLeaseResult + } else { - Log.Debug("SBR delayed attempt to acquire lease for [{0} ms]", decision.AcquireDelay.TotalMilliseconds); - Timers.StartSingleTimer(AcquireLease.Instance, AcquireLease.Instance, decision.AcquireDelay); + Log.Debug("SBR delayed attempt to acquire lease for [{0} ms]", + decision.AcquireDelay.TotalMilliseconds); + Timers.StartSingleTimer(AcquireLease.Instance, AcquireLease.Instance, + decision.AcquireDelay); } + Context.Become(WaitingForLease(decision)); } + break; default: - throw new InvalidOperationException("Unexpected lease decision although lease is not configured"); + throw new InvalidOperationException( + "Unexpected lease decision although lease is not configured"); } + break; case IDecision decision: ActOnDecision(decision); break; } - } switch (releaseLeaseCondition) { @@ -466,9 +367,6 @@ private void OnTick() if (rlc.Deadline.IsOverdue) ReleaseLease(); // reply message is ReleaseLeaseResult, which will update the releaseLeaseCondition break; - default: - // no lease or first waiting for downed nodes to be removed - break; } } @@ -476,16 +374,13 @@ private void OnAcquireLease() { Log.Debug("SBR trying to acquire lease"); //implicit val ec: ExecutionContext = internalDispatcher - if (Strategy.Lease != null) - { - Strategy.Lease.Acquire().ContinueWith(r => + Strategy.Lease?.Acquire().ContinueWith(r => { if (r.IsFaulted) Log.Error(r.Exception, "SBR acquire of lease failed"); return new AcquireLeaseResult(!r.IsFaulted ? r.Result : false); }) .PipeTo(Self); - } } public Receive WaitingForLease(IDecision decision) @@ -506,13 +401,16 @@ bool Receive(object message) switch (releaseLeaseCondition) { case ReleaseLeaseCondition.WhenMembersRemoved rlc: - releaseLeaseCondition = new ReleaseLeaseCondition.WhenMembersRemoved(rlc.Nodes.Union(downedNodes)); + releaseLeaseCondition = + new ReleaseLeaseCondition.WhenMembersRemoved(rlc.Nodes.Union(downedNodes)); break; default: if (downedNodes.IsEmpty) - releaseLeaseCondition = new ReleaseLeaseCondition.WhenTimeElapsed(Deadline.Now + releaseLeaseAfter); + releaseLeaseCondition = + new ReleaseLeaseCondition.WhenTimeElapsed(Deadline.Now + releaseLeaseAfter); else - releaseLeaseCondition = new ReleaseLeaseCondition.WhenMembersRemoved(downedNodes); + releaseLeaseCondition = + new ReleaseLeaseCondition.WhenMembersRemoved(downedNodes); break; } } @@ -556,9 +454,7 @@ private void OnReleaseLeaseResult(bool released) Log.Info("SBR released lease."); releaseLeaseCondition = ReleaseLeaseCondition.NoLease.Instance; // released successfully } - break; - default: - // no lease or first waiting for downed nodes to be removed + break; } } @@ -588,43 +484,42 @@ public ImmutableHashSet ActOnDecision(IDecision decision) // downing is idempotent, and we also avoid calling down on nodes with status Down // down selfAddress last, since it may shutdown itself if down alone foreach (var uniqueAddress in nodesToDown) - { if (!uniqueAddress.Equals(SelfUniqueAddress)) Down(uniqueAddress, decision); - } if (downMyself) Down(SelfUniqueAddress, decision); ResetReachabilityChangedStats(); ResetStableDeadline(); } + return nodesToDown; } public void ObserveDecision( - IDecision decision, - ImmutableHashSet nodesToDown - ) + IDecision decision, + ImmutableHashSet nodesToDown + ) { var downMyself = nodesToDown.Contains(SelfUniqueAddress); - var indirectlyConnectedLogMessage = decision.IsIndirectlyConnected ? $", indirectly connected [{string.Join(", ", Strategy.IndirectlyConnected)}]" : ""; + var indirectlyConnectedLogMessage = decision.IsIndirectlyConnected + ? $", indirectly connected [{string.Join(", ", Strategy.IndirectlyConnected)}]" + : ""; Log.Warning( - $"SBR took decision {decision} and is downing [{string.Join(", ", nodesToDown.Select(i => i.Address))}]{(downMyself ? " including myself, " : "")}, " + - $"[{Strategy.Unreachable.Count}] unreachable of [{Strategy.Members.Count}] members" + - indirectlyConnectedLogMessage + - $", full reachability status: {Strategy.Reachability}"); + $"SBR took decision {decision} and is downing [{string.Join(", ", nodesToDown.Select(i => i.Address))}]{(downMyself ? " including myself, " : "")}, " + + $"[{Strategy.Unreachable.Count}] unreachable of [{Strategy.Members.Count}] members" + + indirectlyConnectedLogMessage + + $", full reachability status: {Strategy.Reachability}"); } - public bool IsResponsible => Leader && selfMemberAdded; - public void UnreachableMember(Member m) { if (!m.UniqueAddress.Equals(SelfUniqueAddress)) { Log.Debug("SBR unreachableMember [{0}]", m); - MutateMemberInfo(resetStable: true, () => + MutateMemberInfo(true, () => { Strategy.AddUnreachable(m); UpdateReachabilityChangedStats(); @@ -640,7 +535,7 @@ public void ReachableMember(Member m) if (!m.UniqueAddress.Equals(SelfUniqueAddress)) { Log.Debug("SBR reachableMember [{0}]", m); - MutateMemberInfo(resetStable: true, () => + MutateMemberInfo(true, () => { Strategy.AddReachable(m); UpdateReachabilityChangedStats(); @@ -666,7 +561,7 @@ private void UpdateReachabilityChangedStats() reachabilityChangedStats.FirstChangeTimestamp, now, reachabilityChangedStats.ChangeCount + 1 - ); + ); } public void SeenChanged(ImmutableHashSet
seenBy) @@ -677,14 +572,11 @@ public void SeenChanged(ImmutableHashSet
seenBy) public void AddUp(Member m) { Log.Debug("SBR add Up [{0}]", m); - MutateMemberInfo(resetStable: true, () => + MutateMemberInfo(true, () => { Strategy.Add(m); if (m.UniqueAddress.Equals(SelfUniqueAddress)) - MutateResponsibilityInfo(() => - { - selfMemberAdded = true; - }); + MutateResponsibilityInfo(() => { selfMemberAdded = true; }); }); switch (Strategy) { @@ -698,19 +590,13 @@ public void AddUp(Member m) s.QuorumSize, s.QuorumSize * 2 - 1); break; - default: - // ok - break; } } public void Leaving(Member m) { Log.Debug("SBR leaving [{0}]", m); - MutateMemberInfo(resetStable: false, () => - { - Strategy.Add(m); - }); + MutateMemberInfo(false, () => { Strategy.Add(m); }); } public void AddJoining(Member m) @@ -722,10 +608,7 @@ public void AddJoining(Member m) public void AddWeaklyUp(Member m) { if (m.UniqueAddress.Equals(SelfUniqueAddress)) - MutateResponsibilityInfo(() => - { - selfMemberAdded = true; - }); + MutateResponsibilityInfo(() => { selfMemberAdded = true; }); // treat WeaklyUp in same way as joining AddJoining(m); } @@ -735,8 +618,7 @@ public void Remove(Member m) if (m.UniqueAddress.Equals(SelfUniqueAddress)) Context.Stop(Self); else - { - MutateMemberInfo(resetStable: false, () => + MutateMemberInfo(false, () => { Log.Debug("SBR remove [{0}]", m); Strategy.Remove(m); @@ -749,29 +631,136 @@ public void Remove(Member m) var remainingDownedNodes = rlc.Nodes.Remove(m.UniqueAddress); if (remainingDownedNodes.IsEmpty) - releaseLeaseCondition = new ReleaseLeaseCondition.WhenTimeElapsed(Deadline.Now + releaseLeaseAfter); + releaseLeaseCondition = + new ReleaseLeaseCondition.WhenTimeElapsed(Deadline.Now + releaseLeaseAfter); else - releaseLeaseCondition = new ReleaseLeaseCondition.WhenMembersRemoved(remainingDownedNodes); - break; - default: - // no lease or not holding lease + releaseLeaseCondition = + new ReleaseLeaseCondition.WhenMembersRemoved(remainingDownedNodes); break; } }); - } } private void ReleaseLease() { // implicit val ec: ExecutionContext = internalDispatcher if (Strategy.Lease != null) - { if (!(releaseLeaseCondition is ReleaseLeaseCondition.NoLease)) { Log.Debug("SBR releasing lease"); - Strategy.Lease.Release().ContinueWith(r => new ReleaseLeaseResult(!r.IsFaulted ? r.Result : false)).PipeTo(Self); + Strategy.Lease.Release().ContinueWith(r => new ReleaseLeaseResult(!r.IsFaulted ? r.Result : false)) + .PipeTo(Self); + } + } + + internal class Tick + { + public static readonly Tick Instance = new Tick(); + + private Tick() + { + } + } + + /// + /// Response (result) of the acquire lease request. + /// + protected class AcquireLeaseResult + { + public AcquireLeaseResult(bool holdingLease) + { + HoldingLease = holdingLease; + } + + public bool HoldingLease { get; } + } + + /// + /// Response (result) of the release lease request. + /// + protected class ReleaseLeaseResult + { + public ReleaseLeaseResult(bool released) + { + Released = released; + } + + public bool Released { get; } + } + + /// + /// For delayed acquire of the lease. + /// + protected class AcquireLease + { + public static readonly AcquireLease Instance = new AcquireLease(); + + private AcquireLease() + { + } + } + + protected class ReachabilityChangedStats + { + public ReachabilityChangedStats(DateTime firstChangeTimestamp, DateTime latestChangeTimestamp, + long changeCount) + { + FirstChangeTimestamp = firstChangeTimestamp; + LatestChangeTimestamp = latestChangeTimestamp; + ChangeCount = changeCount; + } + + public DateTime FirstChangeTimestamp { get; } + public DateTime LatestChangeTimestamp { get; } + public long ChangeCount { get; } + + public bool IsEmpty => ChangeCount == 0; + + public override string ToString() + { + if (IsEmpty) return "reachability unchanged"; + + var now = DateTime.UtcNow; + return + $"reachability changed {ChangeCount} times since {(now - FirstChangeTimestamp).TotalMilliseconds} ms ago, " + + $"latest change was {(now - LatestChangeTimestamp).TotalMilliseconds} ms ago"; + } + } + + protected interface IReleaseLeaseCondition + { + } + + protected static class ReleaseLeaseCondition + { + public class NoLease : IReleaseLeaseCondition + { + public static readonly NoLease Instance = new NoLease(); + + private NoLease() + { } } + + public class WhenMembersRemoved : IReleaseLeaseCondition + { + public WhenMembersRemoved(ImmutableHashSet nodes) + { + Nodes = nodes; + } + + public ImmutableHashSet Nodes { get; } + } + + public class WhenTimeElapsed : IReleaseLeaseCondition + { + public WhenTimeElapsed(Deadline deadline) + { + Deadline = deadline; + } + + public Deadline Deadline { get; } + } } } } \ No newline at end of file diff --git a/src/core/Akka.Cluster/SBR/SplitBrainResolverProvider.cs b/src/core/Akka.Cluster/SBR/SplitBrainResolverProvider.cs index 1ed08de91e2..a917f612256 100644 --- a/src/core/Akka.Cluster/SBR/SplitBrainResolverProvider.cs +++ b/src/core/Akka.Cluster/SBR/SplitBrainResolverProvider.cs @@ -1,21 +1,26 @@ -using System; -using System.Collections.Generic; -using System.Text; +//----------------------------------------------------------------------- +// +// Copyright (C) 2009-2021 Lightbend Inc. +// Copyright (C) 2013-2021 .NET Foundation +// +//----------------------------------------------------------------------- + +using System; using Akka.Actor; using Akka.Coordination; namespace Akka.Cluster.SBR { /// - /// Enabled with configuration: - /// { - /// akka.cluster.downing-provider-class = "Akka.Cluster.SBR.SplitBrainResolverProvider" - /// } + /// Enabled with configuration: + /// { + /// akka.cluster.downing-provider-class = "Akka.Cluster.SBR.SplitBrainResolverProvider" + /// } /// public class SplitBrainResolverProvider : IDowningProvider { - private readonly ActorSystem system; private readonly SplitBrainResolverSettings settings; + private readonly ActorSystem system; public SplitBrainResolverProvider(ActorSystem system) { @@ -34,8 +39,7 @@ public TimeSpan DownRemovalMargin #pragma warning restore CS0618 // Type or member is obsolete if (drm != TimeSpan.Zero) return drm; - else - return settings.DowningStableAfter; + return settings.DowningStableAfter; } } @@ -65,7 +69,8 @@ public Props DowningActorProps case SplitBrainResolverSettings.LeaseMajorityName: var lms = settings.LeaseMajoritySettings; var leaseOwnerName = cluster.SelfUniqueAddress.Address.HostPort(); - var lease = LeaseProvider.Get(system).GetLease($"{system.Name}-akka-sbr", lms.LeaseImplementation, leaseOwnerName); + var lease = LeaseProvider.Get(system).GetLease($"{system.Name}-akka-sbr", + lms.LeaseImplementation, leaseOwnerName); strategy = new LeaseMajority(lms.Role, lease, lms.AcquireLeaseDelayForMinority); break; default: @@ -76,4 +81,4 @@ public Props DowningActorProps } } } -} +} \ No newline at end of file diff --git a/src/core/Akka.Cluster/SBR/SplitBrainResolverSettings.cs b/src/core/Akka.Cluster/SBR/SplitBrainResolverSettings.cs index 07ba6274bdf..467ea25a297 100644 --- a/src/core/Akka.Cluster/SBR/SplitBrainResolverSettings.cs +++ b/src/core/Akka.Cluster/SBR/SplitBrainResolverSettings.cs @@ -1,7 +1,12 @@ -using System; -using System.Collections.Generic; +//----------------------------------------------------------------------- +// +// Copyright (C) 2009-2021 Lightbend Inc. +// Copyright (C) 2013-2021 .NET Foundation +// +//----------------------------------------------------------------------- + +using System; using System.Collections.Immutable; -using System.Text; using Akka.Configuration; using Akka.Util.Internal; @@ -15,22 +20,30 @@ public sealed class SplitBrainResolverSettings public const string KeepOldestName = "keep-oldest"; public const string DownAllName = "down-all"; - public readonly static ImmutableHashSet AllStrategyNames = ImmutableHashSet.Create(KeepMajorityName, LeaseMajorityName, StaticQuorumName, KeepOldestName, DownAllName); + public static readonly ImmutableHashSet AllStrategyNames = ImmutableHashSet.Create(KeepMajorityName, + LeaseMajorityName, StaticQuorumName, KeepOldestName, DownAllName); + + private readonly Lazy lazyKeepMajorityRole; + private readonly Lazy lazyKeepOldestSettings; + private readonly Lazy lazyLeaseMajoritySettings; + private readonly Lazy lazyStaticQuorumSettings; public SplitBrainResolverSettings(Config config) { var cc = config.GetConfig("akka.cluster.split-brain-resolver"); if (cc.IsNullOrEmpty()) - throw ConfigurationException.NullOrEmptyConfig("akka.cluster.split-brain-resolver"); + throw ConfigurationException.NullOrEmptyConfig( + "akka.cluster.split-brain-resolver"); DowningStableAfter = cc.GetTimeSpan("stable-after"); if (DowningStableAfter <= TimeSpan.Zero) - throw new ConfigurationException($"'split-brain-resolver.stable-after' must be >= 0s"); + throw new ConfigurationException("'split-brain-resolver.stable-after' must be >= 0s"); DowningStrategy = cc.GetString("active-strategy")?.ToLowerInvariant(); if (!AllStrategyNames.Contains(DowningStrategy)) - throw new ConfigurationException($"Unknown downing strategy 'split-brain-resolver.active-strategy'=[{DowningStrategy}]. Select one of [{string.Join(", ", AllStrategyNames)}]"); + throw new ConfigurationException( + $"Unknown downing strategy 'split-brain-resolver.active-strategy'=[{DowningStrategy}]. Select one of [{string.Join(", ", AllStrategyNames)}]"); { var key = "down-all-when-unstable"; @@ -38,7 +51,8 @@ public SplitBrainResolverSettings(Config config) { case "on": // based on stable-after - DownAllWhenUnstable = TimeSpan.FromSeconds(4).Max(new TimeSpan(DowningStableAfter.Ticks * 3 / 4)); + DownAllWhenUnstable = + TimeSpan.FromSeconds(4).Max(new TimeSpan(DowningStableAfter.Ticks * 3 / 4)); break; case "off": // disabled @@ -47,7 +61,8 @@ public SplitBrainResolverSettings(Config config) default: DownAllWhenUnstable = cc.GetTimeSpan(key); if (DowningStableAfter <= TimeSpan.Zero) - throw new ConfigurationException($"'split-brain-resolver.{key}' must be >= 0s or 'off' to disable"); + throw new ConfigurationException( + $"'split-brain-resolver.{key}' must be >= 0s or 'off' to disable"); break; } } @@ -67,17 +82,15 @@ string Role(Config c) return r; } - lazyKeepMajorityRole = new Lazy(() => - { - return Role(StrategyConfig(KeepMajorityName)); - }); + lazyKeepMajorityRole = new Lazy(() => { return Role(StrategyConfig(KeepMajorityName)); }); lazyStaticQuorumSettings = new Lazy(() => { var c = StrategyConfig(StaticQuorumName); var size = c.GetInt("quorum-size"); if (size < 1) - throw new ConfigurationException($"'split-brain-resolver.{StaticQuorumName}.quorum-size' must be >= 1"); + throw new ConfigurationException( + $"'split-brain-resolver.{StaticQuorumName}.quorum-size' must be >= 1"); return new StaticQuorumSettings(size, Role(c)); }); @@ -95,7 +108,8 @@ string Role(Config c) var c = StrategyConfig(LeaseMajorityName); var leaseImplementation = c.GetString("lease-implementation"); if (string.IsNullOrEmpty(leaseImplementation)) - throw new ConfigurationException($"'split-brain-resolver.{LeaseMajorityName}.lease-implementation' must be defined"); + throw new ConfigurationException( + $"'split-brain-resolver.{LeaseMajorityName}.lease-implementation' must be defined"); var acquireLeaseDelayForMinority = c.GetTimeSpan("acquire-lease-delay-for-minority"); @@ -103,11 +117,6 @@ string Role(Config c) }); } - private readonly Lazy lazyKeepMajorityRole; - private readonly Lazy lazyStaticQuorumSettings; - private readonly Lazy lazyKeepOldestSettings; - private readonly Lazy lazyLeaseMajoritySettings; - public TimeSpan DowningStableAfter { get; } public string DowningStrategy { get; } @@ -125,44 +134,43 @@ string Role(Config c) public sealed class StaticQuorumSettings { - public int Size { get; } - - public string Role { get; } - public StaticQuorumSettings(int size, string role) { Size = size; Role = role; } - } - public sealed class KeepOldestSettings - { - public bool DownIfAlone { get; } + public int Size { get; } public string Role { get; } + } + public sealed class KeepOldestSettings + { public KeepOldestSettings(bool downIfAlone, string role) { DownIfAlone = downIfAlone; Role = role; } - } - - public sealed class LeaseMajoritySettings - { - public string LeaseImplementation { get; } - public TimeSpan AcquireLeaseDelayForMinority { get; } + public bool DownIfAlone { get; } public string Role { get; } + } + public sealed class LeaseMajoritySettings + { public LeaseMajoritySettings(string leaseImplementation, TimeSpan acquireLeaseDelayForMinority, string role) { LeaseImplementation = leaseImplementation; AcquireLeaseDelayForMinority = acquireLeaseDelayForMinority; Role = role; } - } -} + public string LeaseImplementation { get; } + + public TimeSpan AcquireLeaseDelayForMinority { get; } + + public string Role { get; } + } +} \ No newline at end of file