pglookout: support explicit failover priorities
Support explicit prioritization between instances. This can be
configured via the ``failover_priorities`` key, and is consulted
when picking the standby that should perform the promotion in cases
where multiple nodes have a matching replication position.

Previously, and still as the current default, the selection was based
on the sort order of the remote nodes.

The configuration option adds flexibility and supports e.g. topologies
where more preferred and less preferred standbys live in different
network locations.
hnousiainen committed Dec 4, 2024
1 parent 60f65b2 commit bfc997d
Showing 3 changed files with 117 additions and 8 deletions.
8 changes: 8 additions & 0 deletions README.rst
@@ -295,6 +295,14 @@ over_warning_limit_command and to create a warning file.

Shell command to execute in case the node has deemed itself in need of promotion

``failover_priorities`` (default ``{}``)

Defines the priority of nodes for promotion when there are multiple
candidates with the same replication position. This ensures that all
pglookout instances elect the same standby for promotion, while still
allowing for topologies with e.g. less preferred standbys in secondary
network locations. By default, pglookout falls back to the sort order
of the remote connection identifiers for this selection.
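
For illustration, a minimal configuration fragment using this key could
look as follows; the instance names and priority values here are
hypothetical, and higher numbers win ties::

    {
        "failover_priorities": {
            "standby-dc1": 1000,
            "standby-dc2": 1000,
            "standby-dr": 0
        }
    }

With equal replication positions, ``standby-dc2`` would be promoted
(priority ties are still broken by the instance name sort order), and
``standby-dr`` would only be chosen if it alone held the furthest
replication position.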

``known_gone_nodes`` (default ``[]``)

Lists nodes that are explicitly known to have left the cluster. If the old
33 changes: 25 additions & 8 deletions pglookout/pglookout.py
@@ -643,19 +643,36 @@ def do_failover_decision(self, standby_nodes):
        if not known_replication_positions:
            self.log.warning("No known replication positions, canceling failover consideration")
            return
-        # If there are multiple nodes with the same replication positions pick the one with the "highest" name
-        # to make sure pglookouts running on all standbys make the same decision. The rationale for picking
-        # the "highest" node is that there's no obvious way for pglookout to decide which of the nodes is
-        # "best" beyond looking at replication positions, but picking the highest id supports environments
-        # where nodes are assigned identifiers from an incrementing sequence and where we want to
-        # promote the latest and greatest node. In static environments node identifiers can be priority
-        # numbers, with the highest number being the one that should be preferred.
-        furthest_along_instance = max(known_replication_positions[max(known_replication_positions)])
+
+        # Find the instance that is furthest along.
+        # If there are multiple nodes with the same replication positions, try to identify one to promote either
+        # via explicit failover priority configuration, or pick the one with the "highest" name by sort order.
+        # The rationale of this logic is to ensure that all participating pglookouts running on all standbys
+        # make the same decision. The "highest" name works well in environments where nodes are assigned
+        # identifiers from an incrementing sequence and where we want to promote the latest and greatest node.
+
+        # First, find the list of instances that share the most recent replication position
+        furthest_along_instances = known_replication_positions[max(known_replication_positions)]
+        # Second, sort them by "instance name"
+        furthest_along_instances = sorted(furthest_along_instances, reverse=True)
+        # Third, if we have explicit failover priorities, use those to select the instance to be promoted
+        if "failover_priorities" in self.config:
+            highest_priority = max(
+                self.config["failover_priorities"].get(instance, 0)
+                for instance in furthest_along_instances
+            )
+            furthest_along_instances = [
+                instance
+                for instance in furthest_along_instances
+                # Use the same default of 0 here so unprioritized nodes are not dropped entirely
+                if self.config["failover_priorities"].get(instance, 0) == highest_priority
+            ]
+        furthest_along_instance = furthest_along_instances[0]
        self.log.warning(
            "Node that is furthest along is: %r, all replication positions were: %r",
            furthest_along_instance,
            sorted(known_replication_positions),
        )

        total_observers = len(self.connected_observer_nodes) + len(self.disconnected_observer_nodes)
        # +1 in the calculation comes from the master node
        total_amount_of_nodes = len(standby_nodes) + 1 - len(self.never_promote_these_nodes) + total_observers
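
The selection rule above can be condensed into a standalone sketch for
readability; this helper is not part of the commit, and its name and the
sample values below are illustrative only:

def pick_promotion_candidate(instances, priorities):
    """Among standbys sharing the furthest replication position, pick the
    one to promote: highest configured priority first, then highest name."""
    # Mirror the default behavior: sort instance names in descending order.
    candidates = sorted(instances, reverse=True)
    if priorities:
        # Instances missing from the mapping default to priority 0.
        highest = max(priorities.get(instance, 0) for instance in candidates)
        candidates = [instance for instance in candidates if priorities.get(instance, 0) == highest]
    return candidates[0]

# Priority beats name order when replication positions are equal:
assert pick_promotion_candidate(
    ["192.168.63.4", "192.168.54.183"],
    {"192.168.54.183": 1000, "192.168.63.4": 0},
) == "192.168.54.183"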
84 changes: 84 additions & 0 deletions test/test_lookout.py
@@ -1005,6 +1005,90 @@ def test_standbys_failover_equal_replication_positions(pgl):
    assert pgl.execute_external_command.call_count == 1


def test_standbys_failover_equal_replication_positions_with_priorities(pgl):
    now = datetime.datetime.utcnow()
    _set_instance_cluster_state(
        pgl,
        instance="192.168.54.183",
        pg_last_xlog_receive_location="0/70004D8",
        pg_is_in_recovery=True,
        connection=True,
        replication_time_lag=400.435871,
        fetch_time=now,
        db_time=now,
        conn_info="foobar",
    )
    _set_instance_cluster_state(
        pgl,
        instance="192.168.57.180",
        pg_last_xlog_receive_location=None,
        pg_is_in_recovery=False,
        connection=False,
        replication_time_lag=0.0,
        fetch_time=now - datetime.timedelta(seconds=3600),
        db_time=now - datetime.timedelta(seconds=3600),
        conn_info="foobar",
    )
    _set_instance_cluster_state(
        pgl,
        instance="192.168.63.4",
        pg_last_xlog_receive_location="0/70004D8",
        pg_is_in_recovery=True,
        connection=True,
        replication_time_lag=401.104655,
        fetch_time=now,
        db_time=now,
        conn_info="foobar",
    )
    _set_instance_cluster_state(
        pgl,
        instance="192.168.62.4",
        pg_last_xlog_receive_location="0/70004D8",
        pg_is_in_recovery=True,
        connection=True,
        replication_time_lag=401.104655,
        fetch_time=now,
        db_time=now,
        conn_info="foobar",
    )
    _set_instance_cluster_state(
        pgl,
        instance="192.168.52.183",
        pg_last_xlog_receive_location="0/70004D8",
        pg_is_in_recovery=True,
        connection=True,
        replication_time_lag=401.104655,
        fetch_time=now,
        db_time=now,
        conn_info="foobar",
    )

    pgl.current_master = "192.168.57.180"

    pgl.config["failover_priorities"] = {
        "192.168.54.183": 1000,
        "192.168.52.183": 1000,
        "192.168.63.4": 0,
    }

    # Highest by instance name, but lower in priority
    pgl.own_db = "192.168.63.4"
    pgl.check_cluster_state()
    assert pgl.execute_external_command.call_count == 0
    # Second highest by instance name, but with no priority set it counts as 0
    pgl.own_db = "192.168.62.4"
    pgl.check_cluster_state()
    assert pgl.execute_external_command.call_count == 0
    # Shares the highest priority of 1000, but is lower by instance name
    pgl.own_db = "192.168.52.183"
    pgl.check_cluster_state()
    assert pgl.execute_external_command.call_count == 0
    # Second lowest by instance name, but with priority 1000 it is the winner
    pgl.own_db = "192.168.54.183"
    pgl.check_cluster_state()
    assert pgl.execute_external_command.call_count == 1


def test_node_map_when_only_observer_sees_master(pgl):
    cluster_state = {
        "10.255.255.10": {
