Skip to content

Commit

Permalink
Merge pull request #121 from Aiven-Open/rdunklau/wait_for_new_state_when_deciding_failover
Browse files Browse the repository at this point in the history

Decide to fail over based on up-to-date info
  • Loading branch information
alexole authored Apr 10, 2024
2 parents 5700ff3 + 3d43b22 commit 8210a4d
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
12 changes: 7 additions & 5 deletions pglookout/pglookout.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,9 +494,11 @@ def consider_failover(self, own_state, master_node, standby_nodes):
self.log.warning("Not considering failover, because it's not enabled by configuration")
elif self.current_master:
self.cluster_monitor_check_queue.put("Master is missing, ask for immediate state check")
self.failover_decision_queue.get(timeout=self.missing_master_from_config_timeout)
master_known_to_be_gone = self.current_master in self.known_gone_nodes
now = time.monotonic()
config_timeout_exceeded = (now - self.cluster_nodes_change_time) >= self.missing_master_from_config_timeout

if master_known_to_be_gone or config_timeout_exceeded:
# we've seen a master at some point in time, but now it's
# not reachable or removed from configuration, perform an
Expand All @@ -506,14 +508,14 @@ def consider_failover(self, own_state, master_node, standby_nodes):
else:
reason = "master node is not reachable"
self.log.warning("Performing failover decision because %s", reason)
self.do_failover_decision(own_state, standby_nodes)
self.do_failover_decision(standby_nodes)
return
else:
# we've never seen a master and more than failover_timeout
# seconds have passed since last config load (and start of
# connection attempts to other nodes); perform failover
self.log.warning("Performing failover decision because no master node was seen in cluster before timeout")
self.do_failover_decision(own_state, standby_nodes)
self.do_failover_decision(standby_nodes)
return

self.check_replication_lag(own_state, standby_nodes)
Expand Down Expand Up @@ -569,7 +571,7 @@ def check_replication_lag(self, own_state, standby_nodes):
replication_lag,
self.replication_lag_failover_timeout,
)
self.do_failover_decision(own_state, standby_nodes)
self.do_failover_decision(standby_nodes)
else:
self.log.debug(
"Replication lag was: %r, other nodes status was: %r",
Expand Down Expand Up @@ -617,7 +619,7 @@ def _been_in_contact_with_master_within_failover_timeout(self):
return True
return False

def do_failover_decision(self, own_state, standby_nodes):
def do_failover_decision(self, standby_nodes):
if self.connected_master_nodes:
self.log.warning(
"We still have some connected masters: %r, not failing over",
Expand Down Expand Up @@ -664,7 +666,7 @@ def do_failover_decision(self, own_state, standby_nodes):
int(total_amount_of_nodes),
)

if standby_nodes[furthest_along_instance] == own_state:
if furthest_along_instance == self.own_db:
if self.check_for_maintenance_mode_file():
self.log.warning(
"Canceling failover even though we were the node the furthest along, since "
Expand Down
1 change: 1 addition & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def pgl():
pgl_.cluster_monitor._connect_to_db = Mock() # pylint: disable=protected-access
pgl_.create_alert_file = Mock()
pgl_.execute_external_command = Mock()
pgl_.failover_decision_queue = Mock()
try:
yield pgl_
finally:
Expand Down

0 comments on commit 8210a4d

Please sign in to comment.