From 15a82902316b59e2cbf8962ac2665e4b6b038f4c Mon Sep 17 00:00:00 2001
From: Binbin
Date: Sat, 20 Jul 2024 03:27:49 +0800
Subject: [PATCH] Optimize failover time when the new primary node is down
 again (#782)

We currently do not reset failover_auth_time after setting it. It is used
to check auth_timeout and auth_retry_time, but we should at least reset it
after a successful failover.

Let's assume the following scenario:

1. Two replicas initiate an election.
2. Replica 1 is elected as the new primary, and replica 2 does not get
   enough votes.
3. Replica 1 goes down, i.e. the new primary goes down again within a
   short time.
4. Replica 2 learns that the new primary is down and wants to initiate a
   failover, but because the failover_auth_time of the previous round has
   not been reset, it first has to wait for that election to time out and
   then wait for the next retry time, which can take up to
   cluster-node-timeout * 4. This adds a lot of delay.

There is another problem. We add extra delays to failover_auth_time, such
as a random delay of up to 500ms and a rank-based delay of 1s per replica
rank. If replica 2 receives a PONG from the new primary before sending its
FAILOVER_AUTH_REQUEST, that is, before failover_auth_time is reached, it
will turn itself into a replica of the new primary. If the new primary
then goes down again, replica 2 will reuse the stale failover_auth_time to
initiate an election instead of going through the random 500ms and replica
ranking 1s logic again, which may lead to unexpected consequences (for
example, a low-ranking replica initiating an election and becoming the new
primary).

In other words, we need to reset failover_auth_time at the appropriate
time: when a replica switches to a new primary, we reset it, because the
existing failover_auth_time is out of date at that point.

---------

Signed-off-by: Binbin
---
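Note: for background, the timers referenced above are computed in
clusterHandleSlaveFailover(). The following stand-alone sketch models that
arithmetic; the constants mirror the cluster code, but the helper names and
exact values here are illustrative, not authoritative:

    #include <stdlib.h>

    typedef long long mstime_t;

    /* Model of how a replica schedules its election start time: a fixed
     * 500ms delay so the FAIL state can propagate, 0-500ms of random
     * jitter to desynchronize competing replicas, and 1s per rank so the
     * most up-to-date replica tends to run first. */
    mstime_t schedule_election(mstime_t now, int rank) {
        return now + 500 + random() % 500 + (mstime_t)rank * 1000;
    }

    /* Model of how long a stale failover_auth_time blocks a new election:
     * the old election stays valid for auth_timeout (2 * node_timeout),
     * and a fresh one may only be scheduled after auth_retry_time, i.e.
     * roughly node_timeout * 4 as described in step 4 above. */
    mstime_t worst_case_delay(mstime_t node_timeout) {
        mstime_t auth_timeout = node_timeout * 2;
        if (auth_timeout < 2000) auth_timeout = 2000;
        return auth_timeout * 2; /* auth_retry_time */
    }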
 src/cluster_legacy.c             |  7 ++++
 tests/support/util.tcl           | 11 +++++-
 tests/unit/cluster/failover2.tcl | 66 ++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/cluster/failover2.tcl

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 6368228eff..5e7dcb00ba 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -5246,6 +5246,13 @@ void clusterSetPrimary(clusterNode *n, int closeSlots) {
     replicationSetPrimary(n->ip, getNodeDefaultReplicationPort(n));
     removeAllNotOwnedShardChannelSubscriptions();
     resetManualFailover();
+
+    if (server.cluster->failover_auth_time) {
+        /* Since we have changed to a new primary node, the previously set
+         * failover_auth_time should no longer be used, whether it is in
+         * progress or timed out. */
+        server.cluster->failover_auth_time = 0;
+    }
 }
 
 /* -----------------------------------------------------------------------------
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index a8948338ea..2e2c70f205 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -176,7 +176,16 @@ proc verify_log_message {srv_idx pattern from_line} {
     incr from_line
     set result [exec tail -n +$from_line < [srv $srv_idx stdout]]
     if {![string match $pattern $result]} {
-        error "assertion:expected message not found in log file: $pattern"
+        fail "expected message not found in log file: $pattern"
     }
 }
 
+# Verify that the pattern does not exist in the server's stdout after a certain line number.
+proc verify_no_log_message {srv_idx pattern from_line} {
+    incr from_line
+    set result [exec tail -n +$from_line < [srv $srv_idx stdout]]
+    if {[string match $pattern $result]} {
+        fail "unexpected message found in log file: $pattern"
+    }
+}
+
diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
new file mode 100644
index 0000000000..7bc6a05e95
--- /dev/null
+++ b/tests/unit/cluster/failover2.tcl
@@ -0,0 +1,66 @@
+# Check failover when the new primary goes down again shortly after a failover.
+
+start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
+
+    test "Cluster is up" {
+        wait_for_cluster_state ok
+    }
+
+    test "Cluster is writable" {
+        cluster_write_test [srv 0 port]
+    }
+
+    set paused_pid [srv 0 pid]
+    test "Killing one primary node" {
+        pause_process $paused_pid
+    }
+
+    test "Wait for failover" {
+        wait_for_condition 1000 50 {
+            [s -3 role] == "master" || [s -6 role] == "master"
+        } else {
+            fail "No failover detected"
+        }
+    }
+
+    test "Killing the new primary node" {
+        if {[s -3 role] == "master"} {
+            set replica_to_be_primary -6
+            set paused_pid2 [srv -3 pid]
+        } else {
+            set replica_to_be_primary -3
+            set paused_pid2 [srv -6 pid]
+        }
+        pause_process $paused_pid2
+    }
+
+    test "Cluster should eventually be up again" {
+        for {set j 0} {$j < [llength $::servers]} {incr j} {
+            if {[process_is_paused [srv -$j pid]]} continue
+            wait_for_condition 1000 50 {
+                [CI $j cluster_state] eq "ok"
+            } else {
+                fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+            }
+        }
+    }
+
+    test "Wait for new failover" {
+        wait_for_condition 1000 50 {
+            [s $replica_to_be_primary role] == "master"
+        } else {
+            fail "No failover detected"
+        }
+    }
+
+    test "Restarting the previously killed primary nodes" {
+        resume_process $paused_pid
+        resume_process $paused_pid2
+    }
+
+    test "Make sure there is no failover timeout" {
+        verify_no_log_message -3 "*Failover attempt expired*" 0
+        verify_no_log_message -6 "*Failover attempt expired*" 0
+    }
+
+} ;# start_cluster
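Note: the final assertion keys on the "Failover attempt expired" line that
clusterHandleSlaveFailover() logs (via clusterLogCantFailover() with the
CLUSTER_CANT_FAILOVER_EXPIRED reason) when a scheduled election outlives
auth_timeout. A hedged sketch of that guard; the exact condition and
surrounding code are assumptions based on the cluster code, not a verbatim
excerpt:

    /* Inside clusterHandleSlaveFailover(), roughly: abandon an election
     * that is too old to be valid. The reset added to clusterSetPrimary()
     * above is what keeps the second failover in this test from ever
     * reaching this path. */
    if (auth_age > auth_timeout) {
        clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED);
        return;
    }

On the topology: with "start_cluster 3 4" the replicas are assigned to
primaries round-robin, so the nodes at indices -3 and -6 both replicate the
primary at index 0; that is why the test pauses srv 0 and then watches
exactly those two replicas for the promotion.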