From 09ec9b768168ded9c995383f6662813ffc488274 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Thu, 29 Aug 2019 11:31:47 +0200 Subject: [PATCH] storage: remove learners one by one We don't support removing multiple learners atomically just yet, though \#40268 will fix this (likely in raft). That PR though is obstructed by #40207 because we'll need that first to be able to bump raft again (assuming we don't want to fork). Instead of dealing with all that upfront, let's just not remove multiple learners at once right now so that we can flip the default for atomic replication changes to on. If anyone is still trying to remove only learners atomically, they will fail. However the replicate queues place a high priority on removing stray learners whenever it finds them, so this wouldn't be a permanent problem. Besides, we never add multiple learners at once so it's difficult to get into that state in the first place. Without this commit, TestLearnerAdminRelocateRange fails once atomic replication changes are enabled. Release note: None --- pkg/storage/replica_command.go | 18 +++++++++++++----- pkg/storage/replica_learner_test.go | 3 +++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go index 9a867c8ebfc5..65d90c434635 100644 --- a/pkg/storage/replica_command.go +++ b/pkg/storage/replica_command.go @@ -2086,6 +2086,7 @@ func removeTargetFromSlice( func removeLearners( ctx context.Context, db *client.DB, desc *roachpb.RangeDescriptor, ) (*roachpb.RangeDescriptor, error) { + origDesc := desc learners := desc.Replicas().Learners() if len(learners) == 0 { return desc, nil @@ -2095,13 +2096,20 @@ func removeLearners( targets[i].NodeID = learners[i].NodeID targets[i].StoreID = learners[i].StoreID } + chgs := roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, targets...) log.VEventf(ctx, 2, `removing learner replicas %v from %v`, targets, desc) - newDesc, err := db.AdminChangeReplicas(ctx, desc.StartKey, *desc, - roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, targets...)) - if err != nil { - return nil, errors.Wrapf(err, `removing learners from %s`, desc) + // NB: unroll the removals because at the time of writing, we can't atomically + // remove multiple learners. This will be fixed in: + // + // https://github.com/cockroachdb/cockroach/pull/40268 + for i := range chgs { + var err error + desc, err = db.AdminChangeReplicas(ctx, desc.StartKey, *desc, chgs[i:i+1]) + if err != nil { + return nil, errors.Wrapf(err, `removing learners from %s`, origDesc) + } } - return newDesc, nil + return desc, nil } // adminScatter moves replicas and leaseholders for a selection of ranges. diff --git a/pkg/storage/replica_learner_test.go b/pkg/storage/replica_learner_test.go index af8de787783b..7756b8045c88 100644 --- a/pkg/storage/replica_learner_test.go +++ b/pkg/storage/replica_learner_test.go @@ -783,6 +783,9 @@ func TestLearnerAdminRelocateRange(t *testing.T) { }) defer tc.Stopper().Stop(ctx) + _, err := tc.Conns[0].Exec(`SET CLUSTER SETTING kv.atomic_replication_changes.enabled = true`) + require.NoError(t, err) + scratchStartKey := tc.ScratchRange(t) ltk.withStopAfterLearnerAtomic(func() { _ = tc.AddReplicasOrFatal(t, scratchStartKey, tc.Target(1))