From db05d645e90a7a653c114a13b4832e96517d1375 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Mon, 26 Aug 2024 15:27:51 +0800
Subject: [PATCH 01/10] Fix reconfiguring sub-replica causing data loss when
 myself change shard_id

In this case, the sender is myself's primary. When updateShardId is
executed, not only is the sender's shard_id updated, but the shard_id
of myself is updated as well, causing the subsequent areInSameShard
check (that is, the full_sync_required check) to fail.

This one follows #885 and closes #942.

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 4f1e09d4ef..a638a7aa70 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3162,6 +3162,8 @@ int clusterProcessPacket(clusterLink *link) {
 
     /* PING, PONG, MEET: process config information. */
     if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || type == CLUSTERMSG_TYPE_MEET) {
+        int myself_shard_id_changed = 0;
+
         serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type),
                   link->node ? link->node->name : "NULL");
 
@@ -3312,10 +3314,16 @@ int clusterProcessPacket(clusterLink *link) {
                     if (sender->replicaof) clusterNodeRemoveReplica(sender->replicaof, sender);
                     serverLog(LL_NOTICE, "Node %.40s (%s) is now a replica of node %.40s (%s) in shard %.40s",
                               sender->name, sender->human_nodename, sender_claimed_primary->name,
-                              sender_claimed_primary->human_nodename, sender->shard_id);
+                              sender_claimed_primary->human_nodename, sender_claimed_primary->shard_id);
                     clusterNodeAddReplica(sender_claimed_primary, sender);
                     sender->replicaof = sender_claimed_primary;
 
+                    /* The later updateShardId may change myself shard_id, and we
+                     * need to remember whether this change has occurred. */
+                    if (sender_claimed_primary->shard_id && myself != sender && myself->replicaof == sender) {
+                        myself_shard_id_changed = 1;
+                    }
+
                     /* Update the shard_id when a replica is connected to its
                      * primary in the very first time. */
                     updateShardId(sender, sender_claimed_primary->shard_id);
@@ -3398,7 +3406,14 @@ int clusterProcessPacket(clusterLink *link) {
              * so we can try a psync. */
             serverLog(LL_NOTICE, "I'm a sub-replica! Reconfiguring myself as a replica of %.40s from %.40s",
                       myself->replicaof->replicaof->name, myself->replicaof->name);
-            clusterSetPrimary(myself->replicaof->replicaof, 1, !areInSameShard(myself->replicaof->replicaof, myself));
+            if (myself_shard_id_changed) {
+                /* If myself shard_id changes during the clusterProcessPacket, myself
+                 * will not be able to psync with the new shard. */
+                clusterSetPrimary(myself->replicaof->replicaof, 1, 1);
+            } else {
+                int are_in_same_shard = areInSameShard(myself->replicaof->replicaof, myself);
+                clusterSetPrimary(myself->replicaof->replicaof, 1, !are_in_same_shard);
+            }
             clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
         }
 

From 18dae3eabeaf08e4b3be36f195624f064ef0cbe8 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Mon, 26 Aug 2024 15:36:41 +0800
Subject: [PATCH 02/10] fix build warning

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index a638a7aa70..6009929396 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3320,7 +3320,7 @@ int clusterProcessPacket(clusterLink *link) {
 
                     /* The later updateShardId may change myself shard_id, and we
                      * need to remember whether this change has occurred. */
-                    if (sender_claimed_primary->shard_id && myself != sender && myself->replicaof == sender) {
+                    if (myself != sender && myself->replicaof == sender) {
                         myself_shard_id_changed = 1;
                     }
 

From 361f6175be2e4c9747f4db7a16679078c9e438bf Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Wed, 28 Aug 2024 11:21:26 +0800
Subject: [PATCH 03/10] pull the chain replication reduction logic before the
 update of shard_id

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c                     |  53 ++++------
 tests/unit/cluster/replica-migration.tcl | 125 ++++++++++++++++++++++-
 2 files changed, 145 insertions(+), 33 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 6009929396..e15f465a6a 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3162,8 +3162,6 @@ int clusterProcessPacket(clusterLink *link) {
 
     /* PING, PONG, MEET: process config information. */
     if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || type == CLUSTERMSG_TYPE_MEET) {
-        int myself_shard_id_changed = 0;
-
         serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type),
                   link->node ? link->node->name : "NULL");
 
@@ -3318,10 +3316,27 @@ int clusterProcessPacket(clusterLink *link) {
                     clusterNodeAddReplica(sender_claimed_primary, sender);
                     sender->replicaof = sender_claimed_primary;
 
-                    /* The later updateShardId may change myself shard_id, and we
-                     * need to remember whether this change has occurred. */
-                    if (myself != sender && myself->replicaof == sender) {
-                        myself_shard_id_changed = 1;
+                    /* Currently this is the only place where replicaof state can be updated on
+                     * this function, since updateShardId may update myself shard_id and caused
+                     * areInSameShard check failed. Explicitly check for a replication loop before
+                     * attempting the replication chain folding logic. */
+                    if (myself->replicaof && myself->replicaof->replicaof && myself->replicaof->replicaof != myself) {
+                        /* Safeguard against sub-replicas.
+                         *
+                         * A replica's primary can turn itself into a replica if its last slot
+                         * is removed. If no other node takes over the slot, there is nothing
+                         * else to trigger replica migration. In this case, they are not in the
+                         * same shard, so a full sync is required.
+                         *
+                         * Or a replica's primary can turn itself into a replica of its other
+                         * replica during a failover. In this case, they are in the same shard,
+                         * so we can try a psync. */
+                        serverLog(LL_NOTICE, "I'm a sub-replica! Reconfiguring myself as a replica of %.40s from %.40s",
+                                  myself->replicaof->replicaof->name, myself->replicaof->name);
+                        int are_in_same_shard = areInSameShard(myself->replicaof->replicaof, myself);
+                        clusterSetPrimary(myself->replicaof->replicaof, 1, !are_in_same_shard);
+                        /* We will add the CLUSTER_TODO_SAVE_CONFIG flag when we exit the if statement. */
+                        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
                     }
 
                     /* Update the shard_id when a replica is connected to its
@@ -3391,32 +3406,6 @@ int clusterProcessPacket(clusterLink *link) {
             }
         }
 
-        /* Explicitly check for a replication loop before attempting the replication
-         * chain folding logic. */
-        if (myself->replicaof && myself->replicaof->replicaof && myself->replicaof->replicaof != myself) {
-            /* Safeguard against sub-replicas.
-             *
-             * A replica's primary can turn itself into a replica if its last slot
-             * is removed. If no other node takes over the slot, there is nothing
-             * else to trigger replica migration. In this case, they are not in the
-             * same shard, so a full sync is required.
-             *
-             * Or a replica's primary can turn itself into a replica of its other
-             * replica during a failover. In this case, they are in the same shard,
-             * so we can try a psync. */
-            serverLog(LL_NOTICE, "I'm a sub-replica! Reconfiguring myself as a replica of %.40s from %.40s",
-                      myself->replicaof->replicaof->name, myself->replicaof->name);
-            if (myself_shard_id_changed) {
-                /* If myself shard_id changes during the clusterProcessPacket, myself
-                 * will not be able to psync with the new shard. */
-                clusterSetPrimary(myself->replicaof->replicaof, 1, 1);
-            } else {
-                int are_in_same_shard = areInSameShard(myself->replicaof->replicaof, myself);
-                clusterSetPrimary(myself->replicaof->replicaof, 1, !are_in_same_shard);
-            }
-            clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
-        }
-
         /* If our config epoch collides with the sender's try to fix
          * the problem. */
         if (sender && nodeIsPrimary(myself) && nodeIsPrimary(sender) &&
diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl
index 06e9d70ee7..599f193638 100644
--- a/tests/unit/cluster/replica-migration.tcl
+++ b/tests/unit/cluster/replica-migration.tcl
@@ -190,7 +190,7 @@ proc test_nonempty_replica {type} {
         if {$type == "sigstop"} {
             resume_process $primary0_pid
 
-            # Waiting the old primary go online and become a replica.
+            # Wait for the old primary to go online and become a replica.
             wait_for_condition 1000 50 {
                 [s 0 role] eq {slave}
             } else {
@@ -208,6 +208,129 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
     test_nonempty_replica "sigstop"
 } my_slot_allocation cluster_allocate_replicas ;# start_cluster
 
+proc test_sub_replica {type} {
+    test "Sub-replica reports zero repl offset and rank, and fails to win election - $type" {
+        # Write some data to primary 0, slot 1, make a small repl_offset.
+        for {set i 0} {$i < 1024} {incr i} {
+            R 0 incr key_991803
+        }
+        assert_equal {1024} [R 0 get key_991803]
+
+        # Write some data to primary 3, slot 0, make a big repl_offset.
+        for {set i 0} {$i < 10240} {incr i} {
+            R 3 incr key_977613
+        }
+        assert_equal {10240} [R 3 get key_977613]
+
+        R 3 config set cluster-replica-validity-factor 0
+        R 7 config set cluster-replica-validity-factor 0
+        R 3 config set cluster-allow-replica-migration yes
+        R 7 config set cluster-allow-replica-migration no
+
+        # Record the current primary node, server 3 will be migrated later.
+        # And server 7 will become a sub-replica that also will be migrated later.
+        set 3_old_role_response [R 3 role]
+        set 7_old_role_response [R 7 role]
+
+        # 10s, make sure primary 0 will hang in the save.
+        R 0 config set rdb-key-save-delay 100000000
+
+        # Move slot 0 from primary 3 to primary 0.
+        set addr "[srv 0 host]:[srv 0 port]"
+        set myid [R 3 CLUSTER MYID]
+        set code [catch {
+            exec src/valkey-cli {*}[valkeycli_tls_config "./tests"] --cluster rebalance $addr --cluster-weight $myid=0
+        } result]
+        if {$code != 0} {
+            fail "valkey-cli --cluster rebalance returns non-zero exit code, output below:\n$result"
+        }
+
+        # Wait for server 3 and server 7 role response to change.
+        wait_for_condition 1000 50 {
+            [R 3 role] ne $3_old_role_response &&
+            [R 7 role] ne $7_old_role_response
+        } else {
+            puts "R 3 role: [R 3 role]"
+            puts "R 7 role: [R 7 role]"
+            fail "Server 3 and 7 role response has not changed"
+        }
+
+        # Make sure server 3 becomes a replica of primary 0.
+        set new_primary_ip [lindex [R 3 role] 1]
+        set new_primary_port [lindex [R 3 role] 2]
+        assert_equal [s -3 role] {slave}
+        assert_equal "$new_primary_ip:$new_primary_port" $addr
+
+        # And server 7 becomes a replica of primary 0.
+        set new_primary_ip [lindex [R 7 role] 1]
+        set new_primary_port [lindex [R 7 role] 2]
+        assert_equal [s -7 role] {slave}
+        assert_equal "$new_primary_ip:$new_primary_port" $addr
+
+        # Make sure server 7 got a sub-replica log.
+        verify_log_message -7 "*I'm a sub-replica!*" 0
+
+        if {$type == "shutdown"} {
+            # Shutdown primary 0.
+            catch {R 0 shutdown nosave}
+        } elseif {$type == "sigstop"} {
+            # Pause primary 0.
+            set primary0_pid [s 0 process_id]
+            pause_process $primary0_pid
+        }
+
+        # Wait for the replica to become a primary, and make sure
+        # the other primary become a replica.
+        wait_for_condition 1000 50 {
+            [s -4 role] eq {master} &&
+            [s -3 role] eq {slave} &&
+            [s -7 role] eq {slave}
+        } else {
+            puts "s -4 role: [s -4 role]"
+            puts "s -3 role: [s -3 role]"
+            puts "s -7 role: [s -7 role]"
+            fail "Failover does not happened"
+        }
+
+        # Make sure the offset of server 3 / 7 is 0.
+        verify_log_message -3 "*Start of election*offset 0*" 0
+        verify_log_message -7 "*Start of election*offset 0*" 0
+
+        # Make sure the key exists and is consistent.
+        R 3 readonly
+        R 7 readonly
+        wait_for_condition 1000 50 {
+            [R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
+            [R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
+            [R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
+        } else {
+            puts "R 3: [R 3 keys *]"
+            puts "R 4: [R 4 keys *]"
+            puts "R 7: [R 7 keys *]"
+            fail "Key not consistent"
+        }
+
+        if {$type == "sigstop"} {
+            resume_process $primary0_pid
+
+            # Wait for the old primary to go online and become a replica.
+            wait_for_condition 1000 50 {
+                [s 0 role] eq {slave}
+            } else {
+                fail "The old primary was not converted into replica"
+            }
+        }
+    }
+}
+
+start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
+    test_sub_replica "shutdown"
+} my_slot_allocation cluster_allocate_replicas ;# start_cluster
+
+start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
+    test_sub_replica "sigstop"
+} my_slot_allocation cluster_allocate_replicas ;# start_cluster
+
 start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
     test "valkey-cli make source node ignores NOREPLICAS error when doing the last CLUSTER SETSLOT" {
         R 3 config set cluster-allow-replica-migration no

From ed589dbd07434526b3148fd74ac9a218385f176f Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Wed, 28 Aug 2024 15:06:08 +0800
Subject: [PATCH 04/10] fix timing issue

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 tests/unit/cluster/replica-migration.tcl | 31 +++++++++---------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl
index 599f193638..c3ef42ae00 100644
--- a/tests/unit/cluster/replica-migration.tcl
+++ b/tests/unit/cluster/replica-migration.tcl
@@ -11,6 +11,14 @@ proc my_slot_allocation {masters replicas} {
     R [expr $masters-1] cluster addslots 0
 }
 
+proc get_my_primary_peer {srv_idx} {
+    set role_response [R $srv_idx role]
+    set primary_ip [lindex $role_response 1]
+    set primary_port [lindex $role_response 2]
+    set primary_peer "$primary_ip:$primary_port"
+    return $primary_peer
+}
+
 proc test_migrated_replica {type} {
     test "Migrated replica reports zero repl offset and rank, and fails to win election - $type" {
         # Write some data to primary 0, slot 1, make a small repl_offset.
@@ -227,11 +235,6 @@ proc test_sub_replica {type} {
         R 3 config set cluster-allow-replica-migration yes
         R 7 config set cluster-allow-replica-migration no
 
-        # Record the current primary node, server 3 will be migrated later.
-        # And server 7 will become a sub-replica that also will be migrated later.
-        set 3_old_role_response [R 3 role]
-        set 7_old_role_response [R 7 role]
-
         # 10s, make sure primary 0 will hang in the save.
         R 0 config set rdb-key-save-delay 100000000
 
@@ -245,28 +248,16 @@ proc test_sub_replica {type} {
             fail "valkey-cli --cluster rebalance returns non-zero exit code, output below:\n$result"
         }
 
-        # Wait for server 3 and server 7 role response to change.
+        # Make sure server 3 and server 7 become replicas of primary 0.
         wait_for_condition 1000 50 {
-            [R 3 role] ne $3_old_role_response &&
-            [R 7 role] ne $7_old_role_response
+            [get_my_primary_peer 3] eq $addr &&
+            [get_my_primary_peer 7] eq $addr
         } else {
             puts "R 3 role: [R 3 role]"
             puts "R 7 role: [R 7 role]"
             fail "Server 3 and 7 role response has not changed"
         }
 
-        # Make sure server 3 becomes a replica of primary 0.
-        set new_primary_ip [lindex [R 3 role] 1]
-        set new_primary_port [lindex [R 3 role] 2]
-        assert_equal [s -3 role] {slave}
-        assert_equal "$new_primary_ip:$new_primary_port" $addr
-
-        # And server 7 becomes a replica of primary 0.
-        set new_primary_ip [lindex [R 7 role] 1]
-        set new_primary_port [lindex [R 7 role] 2]
-        assert_equal [s -7 role] {slave}
-        assert_equal "$new_primary_ip:$new_primary_port" $addr
-
         # Make sure server 7 got a sub-replica log.
         verify_log_message -7 "*I'm a sub-replica!*" 0
 

From 187352f6fa09cfafe715964b70603e6fd4c69b6a Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Wed, 28 Aug 2024 15:44:29 +0800
Subject: [PATCH 05/10] fix cluster down timing issue

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 tests/unit/cluster/replica-migration.tcl | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl
index c3ef42ae00..0d232b2765 100644
--- a/tests/unit/cluster/replica-migration.tcl
+++ b/tests/unit/cluster/replica-migration.tcl
@@ -287,6 +287,18 @@ proc test_sub_replica {type} {
         verify_log_message -3 "*Start of election*offset 0*" 0
         verify_log_message -7 "*Start of election*offset 0*" 0
 
+        # Wait for the cluster to be ok.
+        wait_for_condition 1000 50 {
+            [CI 3 cluster_state] eq "ok" &&
+            [CI 4 cluster_state] eq "ok" &&
+            [CI 7 cluster_state] eq "ok"
+        } else {
+            puts "R 3: [R 3 cluster info]"
+            puts "R 4: [R 4 cluster info]"
+            puts "R 7: [R 7 cluster info]"
+            fail "Cluster is down"
+        }
+
         # Make sure the key exists and is consistent.
         R 3 readonly
         R 7 readonly

From f7c4367191ad413850202d85d683f44cbe8e9fe4 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 29 Aug 2024 14:42:23 +0800
Subject: [PATCH 06/10] Update src/cluster_legacy.c

Co-authored-by: Ping Xie <pingxie@outlook.com>
Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index e15f465a6a..0cc0c6dd79 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3316,10 +3316,20 @@ int clusterProcessPacket(clusterLink *link) {
                     clusterNodeAddReplica(sender_claimed_primary, sender);
                     sender->replicaof = sender_claimed_primary;
 
-                    /* Currently this is the only place where replicaof state can be updated on
-                     * this function, since updateShardId may update myself shard_id and caused
-                     * areInSameShard check failed. Explicitly check for a replication loop before
-                     * attempting the replication chain folding logic. */
+                    /* The chain reduction logic requires correctly establishing the replication relationship.
+                     * A key decision when designating a new primary for 'myself' is determining whether
+                     * 'myself' and the new primary belong to the same shard, which would imply shared
+                     * replication history and allow safe partial synchronization (psync).
+                     *
+                     * This decision hinges on the shard_id, a per-node property that helps verify if the
+                     * two nodes share the same replication history. It's critical not to update 'myself's
+                     * shard_id prematurely during this process. Doing so could incorrectly associate
+                     * 'myself' with the sender's shard_id, leading the subsequent clusterSetPrimary call
+                     * to falsely assume that 'myself' and the new primary have been in the same shard.
+                     * This mistake could result in data loss by incorrectly permitting a psync.
+                     *
+                     * Therefore, it's essential to delay any shard_id updates until after the replication
+                     * relationship has been properly established and verified. */
                     if (myself->replicaof && myself->replicaof->replicaof && myself->replicaof->replicaof != myself) {
                         /* Safeguard against sub-replicas.
                          *

From 87dcb22e78c6184da5bf9d35ceabae5de4e74186 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 29 Aug 2024 14:43:24 +0800
Subject: [PATCH 07/10] Update src/cluster_legacy.c

Co-authored-by: Ping Xie <pingxie@outlook.com>
Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 0cc0c6dd79..aeacb0270e 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3345,8 +3345,7 @@ int clusterProcessPacket(clusterLink *link) {
                                   myself->replicaof->replicaof->name, myself->replicaof->name);
                         int are_in_same_shard = areInSameShard(myself->replicaof->replicaof, myself);
                         clusterSetPrimary(myself->replicaof->replicaof, 1, !are_in_same_shard);
-                        /* We will add the CLUSTER_TODO_SAVE_CONFIG flag when we exit the if statement. */
-                        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
+                        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
                     }
 
                     /* Update the shard_id when a replica is connected to its

From 7788208e1a554fee1b5b3d3dd5d503e023989a85 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 29 Aug 2024 14:43:30 +0800
Subject: [PATCH 08/10] Update src/cluster_legacy.c

Co-authored-by: Ping Xie <pingxie@outlook.com>
Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index aeacb0270e..5a341681e6 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3343,8 +3343,7 @@ int clusterProcessPacket(clusterLink *link) {
                          * so we can try a psync. */
                         serverLog(LL_NOTICE, "I'm a sub-replica! Reconfiguring myself as a replica of %.40s from %.40s",
                                   myself->replicaof->replicaof->name, myself->replicaof->name);
-                        int are_in_same_shard = areInSameShard(myself->replicaof->replicaof, myself);
-                        clusterSetPrimary(myself->replicaof->replicaof, 1, !are_in_same_shard);
+                        clusterSetPrimary(myself->replicaof->replicaof, 1, !areInSameShard(myself->replicaof->replicaof, myself));
                         clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
                     }
 

From 44c84f5ef799a7586079c3b86c250d5a2cd3bc80 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 29 Aug 2024 14:49:36 +0800
Subject: [PATCH 09/10] fix format

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 5a341681e6..8ee99a3d2c 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3319,10 +3319,10 @@ int clusterProcessPacket(clusterLink *link) {
                     /* The chain reduction logic requires correctly establishing the replication relationship.
                      * A key decision when designating a new primary for 'myself' is determining whether
                      * 'myself' and the new primary belong to the same shard, which would imply shared
-                     * replication history and allow safe partial synchronization (psync).
+                     * replication history and allow a safe partial synchronization (psync).
                      *
                      * This decision hinges on the shard_id, a per-node property that helps verify if the
-                     * two nodes share the same replication history. It's critical not to update 'myself's
+                     * two nodes share the same replication history. It's critical not to update myself's
                      * shard_id prematurely during this process. Doing so could incorrectly associate
                      * 'myself' with the sender's shard_id, leading the subsequent clusterSetPrimary call
                      * to falsely assume that 'myself' and the new primary have been in the same shard.
@@ -3343,8 +3343,10 @@ int clusterProcessPacket(clusterLink *link) {
                          * so we can try a psync. */
                         serverLog(LL_NOTICE, "I'm a sub-replica! Reconfiguring myself as a replica of %.40s from %.40s",
                                   myself->replicaof->replicaof->name, myself->replicaof->name);
-                        clusterSetPrimary(myself->replicaof->replicaof, 1, !areInSameShard(myself->replicaof->replicaof, myself));
-                        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);
+                        clusterSetPrimary(myself->replicaof->replicaof, 1,
+                                          !areInSameShard(myself->replicaof->replicaof, myself));
+                        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE |
+                                             CLUSTER_TODO_FSYNC_CONFIG);
                     }
 
                     /* Update the shard_id when a replica is connected to its

From f5cd6587dcdaebb44acf19ebbbfaaa9524f64df7 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 29 Aug 2024 15:03:12 +0800
Subject: [PATCH 10/10] fix format

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 8ee99a3d2c..21bdd09919 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3345,7 +3345,7 @@ int clusterProcessPacket(clusterLink *link) {
                                   myself->replicaof->replicaof->name, myself->replicaof->name);
                         clusterSetPrimary(myself->replicaof->replicaof, 1,
                                           !areInSameShard(myself->replicaof->replicaof, myself));
-                        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE |
+                        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE |
                                              CLUSTER_TODO_FSYNC_CONFIG);
                     }