From 3d43b226f699c6f15b128b02e38496e586f39007 Mon Sep 17 00:00:00 2001 From: Ronan Dunklau Date: Tue, 9 Apr 2024 10:51:23 +0200 Subject: [PATCH] Decide to fail over based on up-to-date info When we notice the primary node is gone, we need to fetch up-to-date information from the other standby nodes, if necessary. --- pglookout/pglookout.py | 12 +++++++----- test/conftest.py | 1 + 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pglookout/pglookout.py b/pglookout/pglookout.py index 5e92e80..d129e3d 100755 --- a/pglookout/pglookout.py +++ b/pglookout/pglookout.py @@ -494,9 +494,11 @@ def consider_failover(self, own_state, master_node, standby_nodes): self.log.warning("Not considering failover, because it's not enabled by configuration") elif self.current_master: self.cluster_monitor_check_queue.put("Master is missing, ask for immediate state check") + self.failover_decision_queue.get(timeout=self.missing_master_from_config_timeout) master_known_to_be_gone = self.current_master in self.known_gone_nodes now = time.monotonic() config_timeout_exceeded = (now - self.cluster_nodes_change_time) >= self.missing_master_from_config_timeout + if master_known_to_be_gone or config_timeout_exceeded: # we've seen a master at some point in time, but now it's # not reachable or removed from configuration, perform an @@ -506,14 +508,14 @@ def consider_failover(self, own_state, master_node, standby_nodes): else: reason = "master node is not reachable" self.log.warning("Performing failover decision because %s", reason) - self.do_failover_decision(own_state, standby_nodes) + self.do_failover_decision(standby_nodes) return else: # we've never seen a master and more than failover_timeout # seconds have passed since last config load (and start of # connection attempts to other nodes); perform failover self.log.warning("Performing failover decision because no master node was seen in cluster before timeout") - self.do_failover_decision(own_state, standby_nodes) + 
self.do_failover_decision(standby_nodes) return self.check_replication_lag(own_state, standby_nodes) @@ -569,7 +571,7 @@ def check_replication_lag(self, own_state, standby_nodes): replication_lag, self.replication_lag_failover_timeout, ) - self.do_failover_decision(own_state, standby_nodes) + self.do_failover_decision(standby_nodes) else: self.log.debug( "Replication lag was: %r, other nodes status was: %r", @@ -617,7 +619,7 @@ def _been_in_contact_with_master_within_failover_timeout(self): return True return False - def do_failover_decision(self, own_state, standby_nodes): + def do_failover_decision(self, standby_nodes): if self.connected_master_nodes: self.log.warning( "We still have some connected masters: %r, not failing over", @@ -664,7 +666,7 @@ def do_failover_decision(self, own_state, standby_nodes): int(total_amount_of_nodes), ) - if standby_nodes[furthest_along_instance] == own_state: + if furthest_along_instance == self.own_db: if self.check_for_maintenance_mode_file(): self.log.warning( "Canceling failover even though we were the node the furthest along, since " diff --git a/test/conftest.py b/test/conftest.py index 869ad7e..f7daf58 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -30,6 +30,7 @@ def pgl(): pgl_.cluster_monitor._connect_to_db = Mock() # pylint: disable=protected-access pgl_.create_alert_file = Mock() pgl_.execute_external_command = Mock() + pgl_.failover_decision_queue = Mock() try: yield pgl_ finally: