Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix replica unable trigger migration when it received CLUSTER SETSLOT in advance #981

Merged
merged 11 commits into from
Sep 13, 2024
16 changes: 12 additions & 4 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -6390,20 +6390,28 @@ void clusterCommandSetSlot(client *c) {
server.cluster->migrating_slots_to[slot] = NULL;
}

int slot_was_mine = server.cluster->slots[slot] == myself;
clusterNode *my_primary = clusterNodeGetPrimary(myself);
int slot_was_mine = server.cluster->slots[slot] == my_primary;
clusterDelSlot(slot);
clusterAddSlot(n, slot);

/* If we are a primary left without slots, we should turn into a
* replica of the new primary. */
if (slot_was_mine && n != myself && myself->numslots == 0 && server.cluster_allow_replica_migration) {
/* If replica migration is allowed, check if the primary of this shard
* loses its last slot and the shard becomes empty. In this case, we
* should turn into a replica of the new primary. */
if (server.cluster_allow_replica_migration && slot_was_mine && my_primary->numslots == 0) {
serverAssert(n != my_primary);
serverLog(LL_NOTICE,
"Lost my last slot during slot migration. Reconfiguring myself "
"as a replica of %.40s (%s) in shard %.40s",
n->name, n->human_nodename, n->shard_id);
/* `c` is the primary client if `myself` is a replica, prevent it
* from being freed by clusterSetPrimary.
enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
* since we will free it soon. */
enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
if (nodeIsReplica(myself)) protectClient(c);
/* We are migrating to a different shard that has a completely different
* replication history, so a full sync is required. */
clusterSetPrimary(n, 1, 1);
enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
if (nodeIsReplica(myself)) unprotectClient(c);
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
}

enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
1 change: 1 addition & 0 deletions src/networking.c
Original file line number Diff line number Diff line change
Expand Up @@ -2560,6 +2560,7 @@ void freeSharedQueryBuf(void) {
*
* * DEBUG RELOAD and similar.
* * When a Lua script is in -BUSY state.
* * A cluster replica doing CLUSTER SETSLOT and doing migration.
enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
*
* So the function will protect the client by doing two things:
*
Expand Down
25 changes: 23 additions & 2 deletions tests/unit/cluster/replica-migration.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,17 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
test_sub_replica "sigstop"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test "valkey-cli make source node ignores NOREPLICAS error when doing the last CLUSTER SETSLOT" {
proc test_cluster_setslot {type} {
test "valkey-cli make source node ignores NOREPLICAS error when doing the last CLUSTER SETSLOT - $type" {
R 3 config set cluster-allow-replica-migration no
R 7 config set cluster-allow-replica-migration yes

if {$type == "setslot"} {
# Make R 7 drop the PING message so that we have a higher
# chance to trigger the migration from CLUSTER SETSLOT.
R 7 DEBUG DROP-CLUSTER-PACKET-FILTER 1
}

# Move slot 0 from primary 3 to primary 0.
set addr "[srv 0 host]:[srv 0 port]"
set myid [R 3 CLUSTER MYID]
Expand All @@ -349,6 +355,13 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
fail "valkey-cli --cluster rebalance returns non-zero exit code, output below:\n$result"
}

# Wait for R 3 to report that it is an empty replica (cluster-allow-replica-migration no)
wait_for_log_messages -3 {"*I am now an empty primary*"} 0 1000 50

if {$type == "setslot"} {
R 7 DEBUG DROP-CLUSTER-PACKET-FILTER -1
}

# Make sure server 3 lost its replica (server 7) and server 7 becomes a replica of primary 0.
wait_for_condition 1000 50 {
[s -3 role] eq {master} &&
Expand All @@ -361,4 +374,12 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
fail "Server 3 and 7 role response has not changed"
}
}
}

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_cluster_setslot "gossip"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_cluster_setslot "setslot"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster
Loading