From 1505143b12fd088531dcd8a7a5bbe3c00271e88c Mon Sep 17 00:00:00 2001
From: Stefan Bueringer <sbueringer@gmail.com>
Date: Fri, 21 Jul 2023 07:14:21 +0200
Subject: [PATCH] ClusterCacheTracker: fix accessor deletion on health check
 failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Stefan Büringer buringerst@vmware.com
---
 controllers/remote/cluster_cache_tracker.go | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/controllers/remote/cluster_cache_tracker.go b/controllers/remote/cluster_cache_tracker.go
index 191e09db85eb..6daaa6a94784 100644
--- a/controllers/remote/cluster_cache_tracker.go
+++ b/controllers/remote/cluster_cache_tracker.go
@@ -647,12 +647,16 @@ func (t *ClusterCacheTracker) healthCheckCluster(ctx context.Context, in *health
 	}
 
 	err := wait.PollUntilContextCancel(ctx, in.interval, true, runHealthCheckWithThreshold)
-	// An error returned implies the health check has failed a sufficient number of
-	// times for the cluster to be considered unhealthy
-	// NB. we are ignoring ErrWaitTimeout because this error happens when the channel is close, that in this case
-	// happens when the cache is explicitly stopped.
-	if err != nil && !wait.Interrupted(err) {
+	// An error returned implies the health check has failed a sufficient number of times for the cluster
+	// to be considered unhealthy or the cache was stopped and thus the cache context canceled (we pass the
+	// cache context into wait.PollUntilContextCancel).
+	// NB. Log all errors that occurred even if this error might just be from a cancel of the cache context
+	// when the cache is stopped. Logging an error in this case is not a problem and makes debugging easier.
+	if err != nil {
 		t.log.Error(err, "Error health checking cluster", "Cluster", klog.KRef(in.cluster.Namespace, in.cluster.Name))
-		t.deleteAccessor(ctx, in.cluster)
 	}
+	// Ensure in any case that the accessor is deleted (even if it is a no-op).
+	// NB. It is crucial to ensure the accessor was deleted, so it can be later recreated when the
+	// cluster is reachable again
+	t.deleteAccessor(ctx, in.cluster)
 }