From 82865c7892ade51c8f671a9b006bda7c806707c2 Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Tue, 27 Aug 2019 23:51:00 +0200 Subject: [PATCH] storage: allow atomic removal/addition of multiple LEARNERs When we add/remove more than one voter "atomically", we end up with a range descriptor that has the affected voters as VOTER_{INCOMING,OUTGOING}. Adding/removing multiple learners does not need joint consensus, but etcd/raft does require it on a technicality (not being able to prove that the learner changes don't correspond to voter demotions, basically). We don't have LEARNER_{INCOMING,OUTGOING} nor do we want to clutter the range descriptor further. As a result, we're unable to accurately reflect a joint configuration in which only learners changed via the range descriptor, and thus aren't able to mutate only learners in an atomic replication change. We could argue that this isn't needed, but it's actually a very awkward papercut because for removing nodes, we don't distinguish between learners and voters and so anyone running atomic replication changes needs to make sure they're not accidentally trying to remove multiple learners at once. To lift this restriction, just work around the raft restriction. We make the convention that a transition into a joint raft config that takes place under a descriptor that doesn't reflect a joint config is left right when it is entered (i.e. in the same raft command). This essentially amounts to letting etcd/raft trust us that we are not running demotions, which we are not. Release note: None --- pkg/roachpb/data.go | 83 +++++++++++++------ .../client_atomic_membership_change_test.go | 26 ++++++ .../replica_application_state_machine.go | 23 ++++- pkg/storage/replica_command.go | 2 + 4 files changed, 107 insertions(+), 27 deletions(-) diff --git a/pkg/roachpb/data.go b/pkg/roachpb/data.go index cd2aef01bebb..e57b469f6555 100644 --- a/pkg/roachpb/data.go +++ b/pkg/roachpb/data.go @@ -1336,11 +1336,11 @@ func writeTooOldRetryTimestamp(txn *Transaction, err *WriteTooOldError) hlc.Time // Replicas returns all of the replicas present in the descriptor after this // trigger applies. -func (crt ChangeReplicasTrigger) Replicas() []ReplicaDescriptor { +func (crt ChangeReplicasTrigger) Replicas() ReplicaDescriptors { if crt.Desc != nil { - return crt.Desc.Replicas().All() + return crt.Desc.Replicas() } - return crt.DeprecatedUpdatedReplicas + return MakeReplicaDescriptors(crt.DeprecatedUpdatedReplicas) } // ConfChange returns the configuration change described by the trigger. @@ -1361,7 +1361,7 @@ func confChangeImpl( crt interface { Added() []ReplicaDescriptor Removed() []ReplicaDescriptor - Replicas() []ReplicaDescriptor + Replicas() ReplicaDescriptors alwaysV2() bool }, encodedCtx []byte, @@ -1371,7 +1371,7 @@ func confChangeImpl( var sl []raftpb.ConfChangeSingle checkExists := func(in ReplicaDescriptor) error { - for _, rDesc := range replicas { + for _, rDesc := range replicas.All() { if rDesc.ReplicaID == in.ReplicaID { if a, b := in.GetType(), rDesc.GetType(); a != b { return errors.Errorf("have %s, but descriptor has %s", in, rDesc) @@ -1382,7 +1382,7 @@ func confChangeImpl( return errors.Errorf("%s missing from descriptors %v", in, replicas) } checkNotExists := func(in ReplicaDescriptor) error { - for _, rDesc := range replicas { + for _, rDesc := range replicas.All() { if rDesc.ReplicaID == in.ReplicaID { return errors.Errorf("%s must no longer be present in descriptor", in) } @@ -1445,35 +1445,66 @@ func confChangeImpl( }) } - // Check whether we're entering a joint state. This is the case precisely when - // the resulting descriptors tells us that this is the case. Note that we've - // made sure above that all of the additions/removals are in tune with that - // descriptor already. - var enteringJoint bool - for _, rDesc := range replicas { - switch rDesc.GetType() { - case VOTER_INCOMING, VOTER_OUTGOING: - enteringJoint = true - default: - } - } + // Check whether we're entering a joint state. Note that there is a "gotcha" + // if we carry out multiple changes but none of them affect voters, that is, + // we're adding/removing multiple learners. This does not change the quorum + // requirements, so we *should* be able to get by without going through a + // joint state. In fact, we *have to*, unless we want to add additional + // state to the RangeDescriptor (like a Joint boolean or + // LEARNER_{INCOMING,OUTGOING} replica types). + // + // Unfortunately, etcd/raft *always requires* a joint consensus transition + // when it is handed more than one change. This is because as far as it it + // knows, every addition of a learner could be a demotion of a voter (thus + // changing the quorum size); the configuration change API does not specify + // the config on which the changes are to be applied, and the app needs to + // predictably know whether the change will be joint or not. + // + // However, in CRDB we know the base configuration on which the delta + // applies - it is the config represented by the previous descriptor, and as + // outlined above, in the "learners only" case it never reflects a joint + // configuration in this case. Luckily, we can work around this safely by + // giving etcd/raft the joint transition it needs, but transitioning out of + // it immediately when entering it (in the same entry). More precisely, + // after we call rawNode.ApplyConfChange, if we just moved into a joint + // configuration but our descriptor is not joint, we call ApplyConfChange + // again to transition out of the joint state again. + // + // In summary: + // - if crt.Desc.InAtomicReplicationChange(), then we've changed voters and + // will enter the joint configuration, and will leave it explicitly via + // another replication change txn (see wantLeaveJoint below). + // - if !crt.Desc.InAtomicReplicationChange() but there is more than one + // change, then we're only changing learners and don't need joint consensus, + // but etcd/raft wants it, and we go through the joint state but without + // spending any time in it. This amounts to letting etcd/raft trust us + // that we know what the underlying configuration is. + // - if !crt.Desc.InAtomicReplicationChange() and there is only one change, + // then we're adding/removing a single learner or voter without going + // through a joint transition. + raftEnteringJoint := len(added)+len(removed) > 1 + // If there's an incoming/outgoing voter, then even if there is only + // one change apparently we are running it through a joint + // configuration. We're allowed to do that, though we typically + // don't do it outside of tests. + raftEnteringJoint = raftEnteringJoint || replicas.InAtomicReplicationChange() + + // A trigger that neither adds nor removes anything is how we encode a request + // to leave a joint state. Note that this is only applicable in case 1 above, + // i.e. we're chaning something about the voters. wantLeaveJoint := len(added)+len(removed) == 0 - if !enteringJoint { - if len(added)+len(removed) > 1 { - return nil, errors.Errorf("change requires joint consensus") - } - } else if wantLeaveJoint { + + if raftEnteringJoint && wantLeaveJoint { return nil, errors.Errorf("descriptor enters joint state, but trigger is requesting to leave one") } var cc raftpb.ConfChangeI - - if enteringJoint || crt.alwaysV2() { + if raftEnteringJoint || crt.alwaysV2() { // V2 membership changes, which allow atomic replication changes. We // track the joint state in the range descriptor and thus we need to be // in charge of when to leave the joint state. transition := raftpb.ConfChangeTransitionJointExplicit - if !enteringJoint { + if !raftEnteringJoint { // If we're using V2 just to avoid V1 (and not because we actually // have a change that requires V2), then use an auto transition // which skips the joint state. This is necessary: our descriptor diff --git a/pkg/storage/client_atomic_membership_change_test.go b/pkg/storage/client_atomic_membership_change_test.go index 53bc697b2b85..1a1c74e00446 100644 --- a/pkg/storage/client_atomic_membership_change_test.go +++ b/pkg/storage/client_atomic_membership_change_test.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/kr/pretty" @@ -131,3 +132,28 @@ func TestAtomicReplicationChange(t *testing.T) { // Only a lone voter on s5 should be left over. checkDesc(desc, 5) } + +// TODO(tbg): finish this test, add comments. +func TestAtomicReplicationChangeMultipleLearners(t *testing.T) { + defer leaktest.AfterTest(t)() + ctx := context.Background() + knobs, ltk := makeReplicationTestKnobs() + tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{Knobs: knobs}, + ReplicationMode: base.ReplicationManual, + }) + defer tc.Stopper().Stop(ctx) + db := sqlutils.MakeSQLRunner(tc.ServerConn(0)) + db.Exec(t, `SET CLUSTER SETTING kv.learner_replicas.enabled = true`) + db.Exec(t, `SET CLUSTER SETTING kv.atomic_replication_changes.enabled = true`) + + k := tc.ScratchRange(t) + var desc roachpb.RangeDescriptor + ltk.withStopAfterLearnerAtomic(func() { + desc = tc.AddReplicasOrFatal(t, k, tc.Target(1), tc.Target(2)) + }) + require.Len(t, desc.Replicas().Learners(), 2, desc) + + desc = tc.RemoveReplicasOrFatal(t, k, tc.Target(1), tc.Target(2)) + require.Len(t, desc.Replicas().Learners(), 0, desc) +} diff --git a/pkg/storage/replica_application_state_machine.go b/pkg/storage/replica_application_state_machine.go index 11d95503b665..48bd097c9513 100644 --- a/pkg/storage/replica_application_state_machine.go +++ b/pkg/storage/replica_application_state_machine.go @@ -1005,7 +1005,28 @@ func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *re return nil } return sm.r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { - raftGroup.ApplyConfChange(cmd.confChange.ConfChangeI) + cc := cmd.confChange.ConfChangeI.AsV2() + raftGroup.ApplyConfChange(cc) + cc.Context = nil // for info below + newCfg := raftGroup.Status().Config + if !cmd.replicatedResult().ChangeReplicas.Desc.Replicas().InAtomicReplicationChange() && len(newCfg.Voters[1]) > 0 { + // If the RawNode entered a joint configuration but the descriptor does not describe one, + // then we have mutated only learners in this change and were only formally forced into + // joint consensus by a restriction in etcd/raft. Transition out of the joint config + // immediately so that the Raft config reflects the descriptor accurately. + // + // Note that the joint state it's in is equivalent to the non-joint config represented by + // the descriptor (since (1 2 3)&&(1 2 3) is equivalent to (1 2 3) alone), though if we + // didn't auto-transition here we'd hit an error the next time we want to change the + // configuration (can't run simple changes when in joint state, can't enter joint + // state when already in joint state, etc). + // + // See the comment within this method for more details: + _ = (*roachpb.ChangeReplicasTrigger)(nil).ConfChange + + log.VEventf(ctx, 1, "transitioning out of joint config %s", newCfg.String()) + raftGroup.ApplyConfChange(raftpb.ConfChangeV2{}) + } return true, nil }) default: diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go index 77071b82c3ad..9223f57c86ee 100644 --- a/pkg/storage/replica_command.go +++ b/pkg/storage/replica_command.go @@ -1078,6 +1078,8 @@ func addLearnerReplicas( // This isn't crazy, we just need to transition out of the joint config // before returning from this method, and it's unclear that it's worth // doing. + // + // WIP(tbg): address the above TODO now. for _, target := range targets { iChgs := []internalReplicationChange{{target: target, typ: internalChangeTypeAddLearner}} var err error