Skip to content

Commit

Permalink
Fix replica unable trigger migration when it received CLUSTER SETSLOT…
Browse files Browse the repository at this point in the history
… in advance

When the primary and replica cluster-allow-replica-migration configuration items are
different, the replica migration does not meet expectations.

Originally: primary not allow, replica allow. In this case, the code will migrate the
replica away, and the primary will keep as an empty primary. However, in #970, we found
a timing issue. For example, if the replica receives CLUSTER SETSLOT first and then receives
the gossip change, it will not trigger the replica-migration.

We perform replica migration explicitly in CLUSTER SETSLOT in this case.

Signed-off-by: Binbin <binloveplay1314@qq.com>
  • Loading branch information
enjoy-binbin committed Sep 11, 2024
1 parent 50c1fe5 commit 3aef09e
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 2 deletions.
20 changes: 20 additions & 0 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -6383,7 +6383,9 @@ void clusterCommandSetSlot(client *c) {
server.cluster->migrating_slots_to[slot] = NULL;
}

clusterNode *my_primary = clusterNodeGetPrimary(myself);
int slot_was_mine = server.cluster->slots[slot] == myself;
int slot_was_my_primary = server.cluster->slots[slot] == my_primary;
clusterDelSlot(slot);
clusterAddSlot(n, slot);

Expand All @@ -6400,6 +6402,24 @@ void clusterCommandSetSlot(client *c) {
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
}

/* If we are a replica and executing the CLUSTER SETSLOT command from
* my primary, and my primary left without slots, we should turn into a
* replica of the new primary. */
if (nodeIsReplica(myself) && slot_was_my_primary && my_primary->numslots == 0 && server.cluster_allow_replica_migration) {
serverLog(LL_NOTICE,
"Lost my last slot during slot migration. Reconfiguring myself "
"as a replica of %.40s (%s) in shard %.40s",
n->name, n->human_nodename, n->shard_id);
/* We are a replica and the client is actually my primary, the following
* clusterSetPrimary will free the client so we need to protect it. */
protectClient(c);
/* We are migrating to a different shard that has a completely different
* replication history, so a full sync is required. */
clusterSetPrimary(n, 1, 1);
unprotectClient(c);
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
}

/* If this node or this node's primary was importing this slot,
* assigning the slot to itself also clears the importing status. */
if ((n == myself || n == myself->replicaof) && server.cluster->importing_slots_from[slot]) {
Expand Down
1 change: 1 addition & 0 deletions src/networking.c
Original file line number Diff line number Diff line change
Expand Up @@ -2559,6 +2559,7 @@ void freeSharedQueryBuf(void) {
*
* * DEBUG RELOAD and similar.
* * When a Lua script is in -BUSY state.
* * A cluster replica doing CLUSTER SETSLOT and doing migration.
*
* So the function will protect the client by doing two things:
*
Expand Down
25 changes: 23 additions & 2 deletions tests/unit/cluster/replica-migration.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,17 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
test_sub_replica "sigstop"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test "valkey-cli make source node ignores NOREPLICAS error when doing the last CLUSTER SETSLOT" {
proc test_cluster_setslot {type} {
test "valkey-cli make source node ignores NOREPLICAS error when doing the last CLUSTER SETSLOT - $type" {
R 3 config set cluster-allow-replica-migration no
R 7 config set cluster-allow-replica-migration yes

if {$type == "setslot"} {
# Make R 7 drop the PING message so that we have a higher
# chance to trigger the migration from CLUSTER SETSLOT.
R 7 DEBUG DROP-CLUSTER-PACKET-FILTER 1
}

# Move slot 0 from primary 3 to primary 0.
set addr "[srv 0 host]:[srv 0 port]"
set myid [R 3 CLUSTER MYID]
Expand All @@ -349,6 +355,13 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
fail "valkey-cli --cluster rebalance returns non-zero exit code, output below:\n$result"
}

# Wait for R 3 to report that it is an empty replica (cluster-allow-replica-migration no)
wait_for_log_messages -3 {"*I am now an empty primary*"} 0 1000 50

if {$type == "setslot"} {
R 7 DEBUG DROP-CLUSTER-PACKET-FILTER -1
}

# Make sure server 3 lost its replica (server 7) and server 7 becomes a replica of primary 0.
wait_for_condition 1000 50 {
[s -3 role] eq {master} &&
Expand All @@ -361,4 +374,12 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
fail "Server 3 and 7 role response has not changed"
}
}
}

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_cluster_setslot "gossip"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_cluster_setslot "setslot"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

0 comments on commit 3aef09e

Please sign in to comment.