Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix suicide probe bug #250

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// -----------------------------------------------------------------------
// <copyright file="RegressionProbeFailureSpec.cs" company="Petabridge, LLC">
// Copyright (C) 2015 - 2023 Petabridge, LLC <https://petabridge.com>
// </copyright>
// -----------------------------------------------------------------------

using System;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using Akka.Actor;
using Akka.Persistence.TestKit;
using FluentAssertions;
using FluentAssertions.Extensions;
using Xunit;
using Xunit.Abstractions;
using LogEvent = Akka.Event.LogEvent;

namespace Akka.HealthCheck.Persistence.Tests;

public class RegressionProbeFailureSpec: PersistenceTestKit
{
public RegressionProbeFailureSpec(ITestOutputHelper output) : base("akka.loglevel = DEBUG", nameof(RegressionProbeFailureSpec), output)
{
}

[Fact(DisplayName = "Probe should be performed in proper interval with snapshot recovery failure")]
public async Task IntervalTest()
{
await WithSnapshotLoad(load => load.Fail(), async () =>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

{
Sys.EventStream.Subscribe(TestActor, typeof(LogEvent));
var probe = Sys.ActorOf(Props.Create(() =>
new AkkaPersistenceLivenessProbe(true, TimeSpan.FromMilliseconds(400))));
await FishForMessageAsync<LogEvent>(e => e.Message.ToString() is "Recreating persistence probe.");

var stopwatch = Stopwatch.StartNew();
// Default circuit breaker max-failures is 10
foreach (var _ in Enumerable.Range(0, 15))
{
stopwatch.Restart();
await FishForMessageAsync<LogEvent>(e => e.Message.ToString() is "Recreating persistence probe.");
stopwatch.Stop();
// In the original issue, suicide probe is being recreated immediately after failure without waiting
stopwatch.Elapsed.Should().BeGreaterThan(300.Milliseconds());
}
});
}
}
82 changes: 38 additions & 44 deletions src/Akka.HealthCheck.Persistence/AkkaPersistenceLivenessProbe.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,15 @@ public override string ToString()
$"{nameof(Failures)}={Failures?.ToString() ?? "null"})";
}
}

internal sealed class CreateProbe
{
public static readonly CreateProbe Instance = new();

private CreateProbe()
{
}
}

public class AkkaPersistenceLivenessProbe : ActorBase
{
Expand All @@ -85,6 +94,8 @@ public AkkaPersistenceLivenessProbe(bool logInfo, TimeSpan delay)
_id = Guid.NewGuid().ToString("N");
_shutdownCancellable = new Cancelable(Context.System.Scheduler);
_logInfo = logInfo;

Become(obj => HandleMessages(obj) || HandleSubscriptions(obj));
}

public static Props PersistentHealthCheckProps(bool logInfo, TimeSpan delay)
Expand All @@ -96,6 +107,7 @@ public static Props PersistentHealthCheckProps(bool logInfo, TimeSpan delay)

protected override void PostStop()
{
_probe?.Tell(PoisonPill.Instance);
_shutdownCancellable.Cancel();
_shutdownCancellable.Dispose();
base.PostStop();
Expand Down Expand Up @@ -127,25 +139,6 @@ private bool HandleSubscriptions(object msg)
return true;
}

private bool Started(object message)
{
switch (message)
{
case Terminated t when t.ActorRef.Equals(_probe):
Context.Unwatch(_probe);
if(_logInfo)
_log.Debug("Persistence probe terminated. Recreating...");
CreateProbe();
Become(obj => Recreating(obj) || HandleSubscriptions(obj));
return true;
case PersistenceLivenessStatus status:
HandleRecoveryStatus(status);
return true;
}

return false;
}

private void HandleRecoveryStatus(PersistenceLivenessStatus livenessStatus)
{
if(_logInfo)
Expand All @@ -154,16 +147,28 @@ private void HandleRecoveryStatus(PersistenceLivenessStatus livenessStatus)
PublishStatusUpdates();
}

private bool Recreating(object message)
private bool HandleMessages(object message)
{
switch (message)
{
case Terminated t when t.ActorRef.Equals(_probe):
Context.Unwatch(_probe);
_probe = null;
if(_logInfo)
_log.Debug($"Persistence probe terminated. Recreating in {_delay.TotalSeconds} seconds.");
ScheduleProbeRestart();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

return true;

case CreateProbe:
if(_logInfo)
_log.Debug("Persistence probe terminated. Recreating...");
CreateProbe();
_log.Debug("Recreating persistence probe.");

_probe = Context.ActorOf(Props.Create(() => new SuicideProbe(Self, _probeCounter == 0, _id)));
Context.Watch(_probe);
_probe.Tell("hit" + _probeCounter);
_probeCounter++;
return true;

case PersistenceLivenessStatus status:
HandleRecoveryStatus(status);
return true;
Expand All @@ -174,30 +179,19 @@ private bool Recreating(object message)

protected override bool Receive(object message)
{
return Started(message) || HandleSubscriptions(message);
throw new NotImplementedException("Should never hit this line");
}

protected override void PreStart()
{
CreateProbe();
Self.Tell(CreateProbe.Instance);
}

private void CreateProbe()
private void ScheduleProbeRestart()
{
_probe = Context.ActorOf(Props.Create(() => new SuicideProbe(Self, _probeCounter == 0, _id)));
Context.Watch(_probe);

if(_probeCounter == 0)
{
_probe.Tell("hit" + _probeCounter);
}
else
{
Context.System.Scheduler.ScheduleTellOnce(_delay, _probe, "hit" + _probeCounter, Self, _shutdownCancellable);
}
_probeCounter++;
Context.System.Scheduler.ScheduleTellOnce(_delay, Self, CreateProbe.Instance, Self, _shutdownCancellable);
}

private void PublishStatusUpdates()
{
foreach (var sub in _subscribers) sub.Tell(_currentLivenessStatus);
Expand All @@ -220,19 +214,19 @@ internal class SuicideProbe : ReceivePersistentActor
private bool? _persistedSnapshotStore;
private bool? _deletedJournal;
private bool? _deletedSnapshotStore;
private readonly List<Exception> _failures = new List<Exception>();
private readonly List<Exception> _failures = new ();

public SuicideProbe(IActorRef probe, bool firstAttempt, string id)
{
_probe = probe;
_firstAttempt = firstAttempt;
PersistenceId = $"Akka.HealthCheck-{id}";

Recover<string>(str =>
Recover<string>(_ =>
{
_recoveredJournal = true;
});
Recover<SnapshotOffer>(offer =>
Recover<SnapshotOffer>(_ =>
{
_recoveredSnapshotStore = true;
});
Expand All @@ -248,11 +242,11 @@ public SuicideProbe(IActorRef probe, bool firstAttempt, string id)
SaveSnapshot(str);
});

Command<SaveSnapshotSuccess>(save =>
Command<SaveSnapshotSuccess>(_ =>
{
_persistedSnapshotStore = true;
Persist(_message,
s =>
_ =>
{
_persistedJournal = true;
SendRecoveryStatusWhenFinished();
Expand All @@ -266,7 +260,7 @@ public SuicideProbe(IActorRef probe, bool firstAttempt, string id)
_failures.Add(fail.Cause);
_persistedSnapshotStore = false;
Persist(_message,
s =>
_ =>
{
_persistedJournal = true;
SendRecoveryStatusWhenFinished();
Expand Down