Skip to content

Commit

Permalink
disseminate downing decisions faster (Migrated from akka/akka#29640) (#…
Browse files Browse the repository at this point in the history
…4598)

* when SBR downs the reachable side (minority) it's important
  to quickly inform everybody to shutdown
* send gossip directly to downed node, STONITH signal
* gossip to a few random immediately when self is downed, which
  is always the last from the SBR downing
* enable gossip speedup when there are downed members
* adjust TransitionSpect to the new behavior

* update ValidNodeForGossip behavior

Co-authored-by: Aaron Stannard <aaron@petabridge.com>
  • Loading branch information
zbynek001 and Aaronontheweb authored Nov 3, 2020
1 parent 59e149d commit f628ba9
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 4 deletions.
13 changes: 12 additions & 1 deletion src/core/Akka.Cluster.Tests.MultiNode/TransitionSpec.cs
Original file line number Diff line number Diff line change
Expand Up @@ -332,12 +332,23 @@ private void A_Cluster_must_perform_correct_transitions_when_second_becomes_unav
}, _config.First);

EnterBarrier("after-second-down");

RunOn(() =>
{
// first should have sent immediate gossip to second when it downed second
// and second should then shutdown
AwaitAssert(() => Cluster.IsTerminated.Should().BeTrue());
}, _config.Second);

EnterBarrier("second-received-down");

GossipTo(_config.First, _config.Third);
RunOn(() =>
{
AwaitAssert(() => ClusterView.UnreachableMembers.Select(c => c.Address).Should().Contain(GetAddress(_config.Second)));
AwaitMemberStatus(GetAddress(_config.Second), Akka.Cluster.MemberStatus.Down);
AwaitAssert(() => SeenLatestGossip().Should().BeEquivalentTo(ImmutableHashSet.Create(_config.First, _config.Third)));
// second will also gossip when it shuts down, so it has seen it
AwaitAssert(() => SeenLatestGossip().Should().BeEquivalentTo(ImmutableHashSet.Create(_config.First, _config.Second, _config.Third)));
}, _config.First, _config.Third);

EnterBarrier("after-4");
Expand Down
23 changes: 20 additions & 3 deletions src/core/Akka.Cluster/ClusterDaemon.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1747,6 +1747,17 @@ public void Downing(Address address)
UpdateLatestGossip(newGossip);

Publish(_latestGossip);

if (address == _cluster.SelfAddress)
{
// spread the word quickly, without waiting for next gossip tick
SendGossipRandom(MaxGossipsBeforeShuttingDownMyself);
}
else
{
// try to gossip immediately to downed node, as a STONITH signal
GossipTo(member.UniqueAddress);
}
}
else if (member != null)
{
Expand Down Expand Up @@ -1994,6 +2005,12 @@ public ReceiveGossipType ReceiveGossip(GossipEnvelope envelope)
_coordShutdown.Run(CoordinatedShutdown.ClusterLeavingReason.Instance);
}

if (selfStatus == MemberStatus.Down && localGossip.GetMember(SelfUniqueAddress).Status != MemberStatus.Down)
{
_log.Warning("Received gossip where this member has been downed, from [{0}]", from.Address);
ShutdownSelfWhenDown();
}

if (talkback)
{
// send back gossip to sender() when sender() had different view, i.e. merge, or sender() had
Expand Down Expand Up @@ -2032,7 +2049,8 @@ public void GossipSpeedupTick()
/// </summary>
public bool IsGossipSpeedupNeeded()
{
return _latestGossip.Overview.Seen.Count < _latestGossip.Members.Count / 2;
return _latestGossip.Members.Any(m => m.Status == MemberStatus.Down) ||
_latestGossip.Overview.Seen.Count < _latestGossip.Members.Count / 2;
}

private void SendGossipRandom(int n)
Expand Down Expand Up @@ -2541,8 +2559,7 @@ public void GossipStatusTo(UniqueAddress node, IActorRef destination)
/// <returns>TBD</returns>
public bool ValidNodeForGossip(UniqueAddress node)
{
return !node.Equals(SelfUniqueAddress) && _latestGossip.HasMember(node) &&
_latestGossip.ReachabilityExcludingDownedObservers.Value.IsReachable(node);
return !node.Equals(SelfUniqueAddress) && _latestGossip.Overview.Reachability.IsReachable(SelfUniqueAddress, node);
}

/// <summary>
Expand Down

0 comments on commit f628ba9

Please sign in to comment.