Skip to content

Commit

Permalink
Decide to fail over based on up-to-date info
Browse files Browse the repository at this point in the history
When we notice the primary node is gone, we need to get up-to-date
information from the other standby nodes if needed.
  • Loading branch information
rdunklau committed Apr 9, 2024
1 parent 7c5e53c commit 3d43b22
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
12 changes: 7 additions & 5 deletions pglookout/pglookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,9 +494,11 @@ def consider_failover(self, own_state, master_node, standby_nodes):
self.log.warning("Not considering failover, because it's not enabled by configuration")
elif self.current_master:
self.cluster_monitor_check_queue.put("Master is missing, ask for immediate state check")
self.failover_decision_queue.get(timeout=self.missing_master_from_config_timeout)
master_known_to_be_gone = self.current_master in self.known_gone_nodes
now = time.monotonic()
config_timeout_exceeded = (now - self.cluster_nodes_change_time) >= self.missing_master_from_config_timeout

if master_known_to_be_gone or config_timeout_exceeded:
# we've seen a master at some point in time, but now it's
# not reachable or removed from configuration, perform an
Expand All @@ -506,14 +508,14 @@ def consider_failover(self, own_state, master_node, standby_nodes):
else:
reason = "master node is not reachable"
self.log.warning("Performing failover decision because %s", reason)
self.do_failover_decision(own_state, standby_nodes)
self.do_failover_decision(standby_nodes)
return
else:
# we've never seen a master and more than failover_timeout
# seconds have passed since last config load (and start of
# connection attempts to other nodes); perform failover
self.log.warning("Performing failover decision because no master node was seen in cluster before timeout")
self.do_failover_decision(own_state, standby_nodes)
self.do_failover_decision(standby_nodes)
return

self.check_replication_lag(own_state, standby_nodes)
Expand Down Expand Up @@ -569,7 +571,7 @@ def check_replication_lag(self, own_state, standby_nodes):
replication_lag,
self.replication_lag_failover_timeout,
)
self.do_failover_decision(own_state, standby_nodes)
self.do_failover_decision(standby_nodes)
else:
self.log.debug(
"Replication lag was: %r, other nodes status was: %r",
Expand Down Expand Up @@ -617,7 +619,7 @@ def _been_in_contact_with_master_within_failover_timeout(self):
return True
return False

def do_failover_decision(self, own_state, standby_nodes):
def do_failover_decision(self, standby_nodes):
if self.connected_master_nodes:
self.log.warning(
"We still have some connected masters: %r, not failing over",
Expand Down Expand Up @@ -664,7 +666,7 @@ def do_failover_decision(self, own_state, standby_nodes):
int(total_amount_of_nodes),
)

if standby_nodes[furthest_along_instance] == own_state:
if furthest_along_instance == self.own_db:
if self.check_for_maintenance_mode_file():
self.log.warning(
"Canceling failover even though we were the node the furthest along, since "
Expand Down
1 change: 1 addition & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def pgl():
pgl_.cluster_monitor._connect_to_db = Mock() # pylint: disable=protected-access
pgl_.create_alert_file = Mock()
pgl_.execute_external_command = Mock()
pgl_.failover_decision_queue = Mock()
try:
yield pgl_
finally:
Expand Down

0 comments on commit 3d43b22

Please sign in to comment.