Decide to failover based on uptodate info

When we notice the primary node has gone, we need to get up to date information from the other standby nodes if needed.
Aiven-Open · Apr 9, 2024 · 3d43b22 · 3d43b22
1 parent 7c5e53c
commit 3d43b22
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 5 deletions.
diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py
@@ -494,9 +494,11 @@ def consider_failover(self, own_state, master_node, standby_nodes):
                 self.log.warning("Not considering failover, because it's not enabled by configuration")
             elif self.current_master:
                 self.cluster_monitor_check_queue.put("Master is missing, ask for immediate state check")
+                self.failover_decision_queue.get(timeout=self.missing_master_from_config_timeout)
                 master_known_to_be_gone = self.current_master in self.known_gone_nodes
                 now = time.monotonic()
                 config_timeout_exceeded = (now - self.cluster_nodes_change_time) >= self.missing_master_from_config_timeout
+
                 if master_known_to_be_gone or config_timeout_exceeded:
                     # we've seen a master at some point in time, but now it's
                     # not reachable or removed from configuration, perform an
@@ -506,14 +508,14 @@ def consider_failover(self, own_state, master_node, standby_nodes):
                     else:
                         reason = "master node is not reachable"
                     self.log.warning("Performing failover decision because %s", reason)
-                    self.do_failover_decision(own_state, standby_nodes)
+                    self.do_failover_decision(standby_nodes)
                     return
             else:
                 # we've never seen a master and more than failover_timeout
                 # seconds have passed since last config load (and start of
                 # connection attempts to other nodes); perform failover
                 self.log.warning("Performing failover decision because no master node was seen in cluster before timeout")
-                self.do_failover_decision(own_state, standby_nodes)
+                self.do_failover_decision(standby_nodes)
                 return
 
         self.check_replication_lag(own_state, standby_nodes)
@@ -569,7 +571,7 @@ def check_replication_lag(self, own_state, standby_nodes):
                 replication_lag,
                 self.replication_lag_failover_timeout,
             )
-            self.do_failover_decision(own_state, standby_nodes)
+            self.do_failover_decision(standby_nodes)
         else:
             self.log.debug(
                 "Replication lag was: %r, other nodes status was: %r",
@@ -617,7 +619,7 @@ def _been_in_contact_with_master_within_failover_timeout(self):
                 return True
         return False
 
-    def do_failover_decision(self, own_state, standby_nodes):
+    def do_failover_decision(self, standby_nodes):
         if self.connected_master_nodes:
             self.log.warning(
                 "We still have some connected masters: %r, not failing over",
@@ -664,7 +666,7 @@ def do_failover_decision(self, own_state, standby_nodes):
             int(total_amount_of_nodes),
         )
 
-        if standby_nodes[furthest_along_instance] == own_state:
+        if furthest_along_instance == self.own_db:
             if self.check_for_maintenance_mode_file():
                 self.log.warning(
                     "Canceling failover even though we were the node the furthest along, since "

diff --git a/test/conftest.py b/test/conftest.py
@@ -30,6 +30,7 @@ def pgl():
     pgl_.cluster_monitor._connect_to_db = Mock()  # pylint: disable=protected-access
     pgl_.create_alert_file = Mock()
     pgl_.execute_external_command = Mock()
+    pgl_.failover_decision_queue = Mock()
     try:
         yield pgl_
     finally: